import sys
import requests
from math import radians, cos, sin, acos, asin, sqrt
from pathlib import Path
from os import chdir
from datetime import datetime, timezone
from dateutil import tz
import pickle
from time import sleep


class IPCache:
    """Persistent cache mapping an IP to ((lat, lon), fixed-offset timezone).

    Backed by ip-api.com lookups and a pickle file. Use as a context
    manager: the cache is loaded from the pickle file on entry and written
    back on exit, always relative to the directory the object was created
    in (so chdir() calls elsewhere in the program don't lose the file).
    `get()` is only valid inside the `with` block (it needs `self.cache`).
    """

    picklefile_name = 'cached_ips.pkl'

    def __init__(self, /):
        # Remember the creation directory so __enter__/__exit__ can always
        # find the pickle file. The original used `with Path('.') as p:` --
        # pathlib.Path is not a context manager (that deprecated support
        # was removed), so it crashed.
        self.creation_dir = Path('.').resolve()

    def get(self, ip, /):
        """Return ((lat, lon), timezone) for `ip`, querying the API on a miss.

        Raises RuntimeError after 5 failed HTTP attempts, or if the API
        reports the IP as invalid.
        """
        if ip in self.cache:
            return self.cache[ip]
        print(f'{ip} not in cache')
        addr = f'http://ip-api.com/json/{ip}'
        attempts = 1
        max_attempts = 5
        while attempts <= max_attempts:
            response = requests.get(addr)
            if not response.ok:
                print(f'request for {ip} failed with {response.status_code}')
                # Exponential backoff -- but only when we actually retry.
                # (The original slept even after a successful response.)
                sleep(2 ** attempts)
                attempts += 1
                continue
            break
        else:
            # The while condition went false: every attempt failed.
            raise RuntimeError(f'critical failure (> 5 retries)')
        # Parse the JSON body properly. The original used eval() on raw
        # bytes from an untrusted HTTP response -- an injection hole, and
        # it breaks on the JSON literals true/false/null anyway.
        resulting_dict = response.json()
        if resulting_dict['status'] == 'fail':
            raise RuntimeError(f'ip was invalid')
        # The given timezone is like 'Australia/Sydney'. We need to convert
        # it to a datetime.timezone type (fixed offset as of right now).
        timezone_str = resulting_dict['timezone']
        if timezone_str == 'Europe/Kyiv':
            # Older tz databases only know the pre-2022 spelling.
            timezone_str = 'Europe/Kiev'
        tzfile = tz.gettz(timezone_str)
        as_timedelta = tzfile.utcoffset(datetime.utcnow())
        as_timezone_type = timezone(as_timedelta)
        self.cache[ip] = (
            (resulting_dict['lat'], resulting_dict['lon']),
            as_timezone_type,
        )
        return self.cache[ip]

    def __enter__(self, filename=picklefile_name, /):
        """Load the pickled cache (or start empty) from the creation dir."""
        chdir(self.creation_dir)
        # Plain exists() check -- Path is not a context manager.
        if Path(filename).exists():
            with open(filename, 'rb') as f:
                cache = pickle.load(f)
        else:
            cache = {}
        self.filename = filename
        self.cache = cache
        return self

    def __exit__(self, err_type, err_value, traceback, /):
        """Write the (possibly updated) cache back where it was loaded from."""
        chdir(self.creation_dir)
        with open(self.filename, 'wb') as f:
            pickle.dump(self.cache, f)


def filter_logs(filename: str):
    """Yield (ip, timestamp, status, method, file) for successful GETs.

    Each log line looks like: IP,YYMMDD hhmmss TIMEZONE,STATUS,METHOD,FILE
    Lines whose status is not 200 or whose method is not GET are skipped.
    """
    with open(filename, 'r') as f:
        for line in f:
            # maxsplit=4 keeps any commas inside the file path intact.
            ip, timethings, status, method, filepath = line.split(',', 4)
            if (
                status != '200'
                or method != 'GET'
            ):
                continue
            # Drop the first four path components (site/URL prefix).
            file = '/'.join(filepath.split('/')[4:]).strip()
            yield ip, timethings, status, method, file


def latlon_distance(p1, p2) -> float:
    """Great-circle distance in km between two (lat, lon) points.

    Black magic, do not touch: uses the haversine formula.
    """
    lat1, lon1 = p1
    lat2, lon2 = p2
    lon1 = radians(lon1)
    lat1 = radians(lat1)
    lon2 = radians(lon2)
    lat2 = radians(lat2)
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * asin(sqrt(a))
    earth_radius_km = 6371
    return c * earth_radius_km


def analyze_server(server: Path) -> str:
    """Build CSV rows for every successful GET in `server`/access.log.

    Returns the rows as one string. Returns '' (not None, which the
    original did -- main() passes the result to f.write(), and
    f.write(None) raises TypeError) when `server` is not a directory.
    Relies on the module-level `ip_cache` being open (see __main__ guard).
    """
    if not server.is_dir():
        return ''
    serverip = get_server_ip(server.name)
    rows = []  # collect lines and join once instead of quadratic +=
    filename = f'{server.name}/access.log'
    self_latlon, _ = ip_cache.get(serverip)
    for ip, timethings, status, method, file in filter_logs(filename):
        # Latitude/longitude and timezone of the requester (cached in the
        # ip_cache picklefile between runs). Named `client_tz` because the
        # original's `timezone` shadowed datetime.timezone.
        latlon, client_tz = ip_cache.get(ip)
        # %z in log_date_format makes this timezone-aware already; the
        # original clobbered the parsed offset with .replace(tzinfo=utc).
        time_of_hit = datetime.strptime(timethings, log_date_format)
        # Express the hit in the requester's local time.
        localtime = time_of_hit.astimezone(client_tz)
        # Back to a string for logging purposes, without the timezone part.
        localdate, localtime = localtime.strftime(log_date_format).split(' ')[:2]
        localdate = '20' + localdate
        distance = latlon_distance(self_latlon, latlon)
        date, time = timethings.split(' ')[:2]
        lat, lon = latlon
        rows.append(
            f'{server.name},{ip},{lat},{lon},{distance:.5f},20{date},{time},'
            f'{localdate},{localtime},{method},"{file}"\n'
        )
    return ''.join(rows)


def get_server_ip(servername: str) -> str:
    """Return the known public IP for `servername`.

    Raises ValueError for an unknown server name.
    """
    known_servers = {
        'nova': '184.73.25.153',
        'singapore': '18.139.108.77',
        'sydney': '54.206.216.118',
        'dublin': '54.194.92.137',
        'brazil': '18.228.245.48',
    }
    try:
        return known_servers[servername]
    except KeyError:
        raise ValueError(f'{servername} is not a known server') from None


def main(args: list) -> int:
    """Analyze every server subdirectory of args[1] into analysis.csv.

    Returns a process exit code (0 on success, 1 on bad usage).
    """
    if len(args) == 1:
        # no log dirs provided
        print('no logdir provided')
        return 1
    outfile = 'analysis.csv'
    start_dir = Path('.').resolve()
    logdir = Path(args[1]).resolve()
    with open(outfile, 'w') as f:
        # Header now matches the 11-column rows emitted by analyze_server
        # (the original header omitted the lat/lon columns).
        f.write(
            'server,ip,lat,lon,distance,sv date,sv time,'
            'local date,local time,verb,filename\n'
        )
        chdir(logdir)
        try:
            serverdir = Path('.')
            for subdir in serverdir.iterdir():
                csv_lines = analyze_server(subdir)
                f.write(csv_lines)
        finally:
            # Always restore the cwd so IPCache.__exit__ (which writes the
            # pickle relative to its creation dir) behaves predictably even
            # if analysis fails partway through.
            chdir(start_dir)
    return 0


# Log timestamps look like '23/04/01 12:34:56 +0000'.
log_date_format = r'%y/%m/%d %H:%M:%S %z'

if __name__ == '__main__':
    with IPCache() as ip_cache:
        code = main(sys.argv)
        sys.exit(code)