class IPCache:
    """Pickle-backed cache of ip-api.com lookups, usable as a context manager.

    Each cache entry maps an IP string to ``((lat, lon), tzinfo)`` where
    ``tzinfo`` is a fixed-offset ``datetime.timezone``.  Entering the context
    loads the cache from the pickle file (if present); leaving it writes the
    cache back, including when the body raised.

    The pickle file is always resolved relative to the directory that was
    current when the instance was created, so the caller may freely
    ``chdir`` in between.
    """

    picklefile_name = 'cached_ips.pkl'
    # seconds to wait on ip-api.com before giving up on a lookup
    request_timeout = 10

    def __init__(self, /):
        # Path objects are not context managers (deprecated in 3.11, removed
        # in 3.13), so resolve the directory directly.
        self.creation_dir = Path('.').resolve()
        # Defaults so that get() also works before / without __enter__.
        self.filename = self.picklefile_name
        self.cache = {}

    def get(self, ip, /):
        """Return ``((lat, lon), timezone)`` for ``ip``, querying on a miss.

        Raises:
            RuntimeError: the HTTP request failed, the body was not JSON,
                the service reported ``status == 'fail'``, or the reported
                timezone name is unknown to dateutil.
        """
        cached = self.cache.get(ip)
        if cached is not None:
            return cached

        addr = f'http://ip-api.com/json/{ip}'
        response = requests.get(addr, timeout=self.request_timeout)
        if not response.ok:
            raise RuntimeError(f'request for ip failed with {response.status_code}')
        # Parse the body as JSON; never eval() text received from the network.
        try:
            resulting_dict = response.json()
        except ValueError as err:
            raise RuntimeError('response for ip was not valid JSON') from err
        if resulting_dict['status'] == 'fail':
            raise RuntimeError('ip was invalid')

        # the given timezone is like, 'Australia/Sydney'. we need to convert
        # to a datetime.timezone type
        timezone_str = resulting_dict['timezone']
        tzfile = tz.gettz(timezone_str)
        if tzfile is None:
            raise RuntimeError(f'unknown timezone {timezone_str!r}')
        # Ask for "now" in that zone so the offset is evaluated against an
        # aware datetime instead of a naive UTC value treated as local time.
        # NOTE: this freezes the zone's *current* offset; later DST changes
        # are not reflected in the cached entry.
        as_timedelta = datetime.now(tzfile).utcoffset()
        as_timezone_type = timezone(as_timedelta)

        entry = ((resulting_dict['lat'], resulting_dict['lon']), as_timezone_type)
        self.cache[ip] = entry
        return entry

    def __enter__(self, filename=picklefile_name, /):
        """Load the cache from ``filename`` (relative to the creation dir)."""
        cache_path = self.creation_dir / filename
        try:
            # SECURITY: pickle executes code on load; the cache file must only
            # ever be one written by this program.
            with open(cache_path, 'rb') as f:
                cache = pickle.load(f)
        except FileNotFoundError:
            cache = {}
        except (EOFError, pickle.UnpicklingError):
            # A truncated/corrupt file (e.g. an interrupted earlier dump) is
            # only a lost cache, not a reason to abort the run.
            cache = {}
        self.filename = filename
        self.cache = cache
        return self

    def __exit__(self, err_type, err_value, traceback, /):
        """Write the cache back to disk; never suppresses exceptions."""
        with open(self.creation_dir / self.filename, 'wb') as f:
            pickle.dump(self.cache, f)
ip-api.com to associate an ip to a - # latitude and longitude - addr = f'http://ip-api.com/json/{ip}' - response = requests.get(addr) - if not response.ok: - raise RuntimeError(f'request for ip failed with {response.status_code}') - resulting_dict = eval(response.content) - if resulting_dict['status'] != 'fail': - return resulting_dict['lat'], resulting_dict['lon'] - -def value_sort(d: dict, reverse=True) -> dict: - return sorted(d.items(), key=lambda kv_pair: kv_pair[1], reverse=reverse) + yield ip, timethings, status, method, file def latlon_distance(p1, p2) -> float: # black magic do not touch. use the haversine formula to find the distance @@ -54,45 +85,31 @@ def analyze_server(server: Path, serverip: str) -> None: if not server.is_dir(): return + result = '' + filename = f'{server.name}/access.log' - hitfiles = {} - requesters = {} - distances = {} - times = {} - self_latlon = get_ip_latlon(serverip) - for ip, date, time_str, timezone, status, method, file in filter_logs(filename): - if file in hitfiles: - hitfiles[file] += 1 - else: - hitfiles[file] = 1 + self_latlon, _ = ip_cache.get(serverip) + for ip, timethings, status, method, file in filter_logs(filename): + # get_ip_latlon_tz() returns the latitude, longitude, and timezone + # of an ip + # TODO: cache results in a picklefile + latlon, timezone = ip_cache.get(ip) - if ip in requesters: - requesters[ip] += 1 - else: - requesters[ip] = 1 - latlon = get_ip_latlon(ip) - if latlon is not None: - distances[ip] = latlon_distance(self_latlon, latlon) + # convert `timethings` to a datetime object + time_of_hit = datetime.strptime(timethings, log_date_format) + # convert from its default timezone in GMT to the timezone of the requester + localtime = time_of_hit.replace(tzinfo=timezone.utc).astimezone(timezone) + # convert this time back to a string for logging purposes + localtime_str = localtime.strftime(log_date_format) - hour = time_str.split(':')[0] - if hour in times: - times[hour] += 1 - else: - times[hour] 
= 1 - - print(f'\n\n--- ANALYSIS FOR {server.name.upper()} ---\n') - - for dict_name in ['hitfiles', 'requesters', 'distances', 'times']: - print( - dict_name + ': {\n ' - + ',\n '.join( - f'{k!r}: {v!r}' - for k, v in value_sort(eval(dict_name)) - ) - + '\n}' + distance = latlon_distance(self_latlon, latlon) + date, time = timethings.split(' ')[:2] + result += ( + f'{server.name},{ip},{distance},{date},{time},' + + f'{localtime_str},{method},{file}\n' ) - print(f'average: {sum(distances.values())/len(distances)}') + return result def get_server_ip(servername: str) -> str: # associate servers with ips @@ -115,45 +132,24 @@ def main(args: list) -> int: print('no logdir provided') return 1 + outfile = 'analysis.csv' start_dir = Path('.').resolve() + f = open(outfile, 'a') for logdir in args[1:]: chdir(logdir) serverdir = Path('.') for subdir in serverdir.iterdir(): serverip = get_server_ip(subdir.name) - analyze_server(subdir, serverip) + csv_lines = analyze_server(subdir, serverip) chdir(start_dir) + f.write(csv_lines) + f.close() return 0 -validnames = { - 'wp-login.php', - '.env', - 'plugins/system/debug/debug.xml', - 'administrator/language/en-GB/en-GB.xml', - 'administrator/help/en-GB/toc.json', - '.git/config', - 'vendor/phpunit/phpunit/src/Util/PHP/eval-stdin.php', - 'xmlrpc.php', - 'wp1/wp-includes/wlwmanifest.xml', - 'wp/wp-includes/wlwmanifest.xml', - 'wordpress/wp-includes/wlwmanifest.xml', - 'web/wp-includes/wlwmanifest.xml', - 'test/wp-includes/wlwmanifest.xml', - 'site/wp-includes/wlwmanifest.xml', - 'shop/wp-includes/wlwmanifest.xml', - 'cms/wp-includes/wlwmanifest.xml', - 'blog/wp-includes/wlwmanifest.xml', - '2019/wp-includes/wlwmanifest.xml', - 'wp-load.php', - 'public/_ignition/health-check', - '_ignition/health-check', - 'admin/.env', - 'protected/.env', - 'wp-includes/wp-class.php', - 'wp-commentin.php', - 'wp-signin.php' -} +log_date_format = r'%y/%m/%d %H:%M:%S %z' + if __name__ == '__main__': - sys.exit(main(sys.argv)) \ No newline at end of 
file + with IPCache() as ip_cache: + sys.exit(main(sys.argv)) \ No newline at end of file