diff --git a/create-athena-org-table.sql b/create-athena-org-table.sql index 4595b97..a8c05fa 100644 --- a/create-athena-org-table.sql +++ b/create-athena-org-table.sql @@ -1,45 +1,60 @@ -- https://github.com/duo-labs/cloudtrail-partitioner/issues/14 --- aws --profile foo s3 ls s3://cloudtrail-bucket/AWSLogs/o-orgid/ | cut -c 32-50 | tr / , | xargs | tr -d ' ' +-- aws s3 ls s3://BUCKETNAME/AWSLogs/o-ORGNAME/ | cut -c 32-50 | tr / , | xargs | tr -d ' ' +-- To use this: +-- 1. Find all the comments below that tell you "Edit the next line" and do what they say. +-- 2. Go to Amazon Athena and copy paste this SQL into the SQL box and execute it +-- 3. You'll now have a table named `orgtrail`. Do a quick search to verify it: +-- SELECT * from `orgtrail` limit 5; CREATE EXTERNAL TABLE orgtrail ( - `eventversion` string, - `useridentity` struct,sessionissuer:struct>>, - `eventtime` string, - `eventsource` string, - `eventname` string, - `awsregion` string, - `sourceipaddress` string, - `useragent` string, - `errorcode` string, - `errormessage` string, - `requestparameters` string, - `responseelements` string, - `additionaleventdata` string, - `requestid` string, - `eventid` string, - `resources` array>, - `eventtype` string, - `apiversion` string, - `readonly` string, - `recipientaccountid` string, - `serviceeventdetails` string, - `sharedeventid` string, - `vpcendpointid` string) + `eventtime` string, + `eventsource` string, + `eventname` string, + `awsregion` string, + `sourceipaddress` string, + `useragent` string, + `errorcode` string, + `errormessage` string, + `requestparameters` string, + `responseelements` string, + `additionaleventdata` string, + `requestid` string, + `eventid` string, + `eventtype` string, + `apiversion` string, + `readonly` string, + `recipientaccountid` string, + `serviceeventdetails` string, + `sharedeventid` string, + `vpcendpointid` string, + `eventversion` string, + `resources` array>, + `useridentity` struct,sessionissuer:struct>> +) COMMENT 'CloudTrail table with projection for future partitions' PARTITIONED BY (accountId string, region string, date string) -ROW FORMAT SERDE 'com.amazon.emr.hive.serde.CloudTrailSerde' -STORED AS INPUTFORMAT 'com.amazon.emr.cloudtrail.CloudTrailInputFormat' +ROW FORMAT SERDE 'com.amazon.emr.hive.serde.CloudTrailSerde' +STORED AS INPUTFORMAT 'com.amazon.emr.cloudtrail.CloudTrailInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' +-- Edit the next line to change BUCKETNAME and o-ORGNAME to your bucket name and your Org ID LOCATION 's3://BUCKETNAME/AWSLogs/o-ORGNAME' TBLPROPERTIES( - "projection.enabled" = "true", - "projection.date.type" = "date", - "projection.date.range" = "2021/01/01,NOW", - "projection.date.format" = "yyyy/MM/dd", - "projection.date.interval" = "1" , + "projection.enabled" = "true", + "projection.date.type" = "date", +-- Edit the next line if you want to make queries more or less efficient. Setting the date range +-- to a more recent date means records from earlier dates will be inaccessible, but it will speed +-- up querying. Set it to the oldest date you need to access. + "projection.date.range" = "2021/01/01,NOW", + "projection.date.format" = "yyyy/MM/dd", + "projection.date.interval" = "1" , "projection.accountid.type" = "enum", +-- Edit the next line to insert each 12-digit AWS account ID that you are logging for. Yes, if you +-- get a new AWS account, you have to destroy the table definition, add that ID to this list, and +-- recreate the table. It's clunky, but it works. "projection.accountid.values" = "111111111111,222222222222,333333333333", - "projection.region.type" = "enum", - "projection.region.values" = "ap-northeast-1,ap-northeast-2,ap-northeast-3,ap-south-1,ap-southeast-1,ap-southeast-2,ca-central-1,eu-central-1,eu-north-1,eu-west-1,eu-west-2,eu-west-3,sa-east-1,us-east-1,us-east-2,us-west-1,us-west-2", + "projection.region.type" = "enum", +-- Edit the next line to have the regions you care about + "projection.region.values" = "ap-northeast-1,ap-northeast-2,ap-northeast-3,ap-south-1,ap-southeast-1,ap-southeast-2,ca-central-1,eu-central-1,eu-north-1,eu-west-1,eu-west-2,eu-west-3,sa-east-1,us-east-1,us-east-2,us-west-1,us-west-2", +-- Edit the next line to change BUCKETNAME and o-ORGNAME to your bucket name and your Org ID "storage.location.template" = "s3://BUCKETNAME/AWSLogs/o-ORGNAME/${accountid}/CloudTrail/${region}/${date}" );