sciencefair2023/analyze.py

173 lines
5.5 KiB
Python

import sys
import requests
from math import radians, cos, sin, acos, asin, sqrt
from pathlib import Path
from os import chdir
from datetime import datetime, timezone
from dateutil import tz
import pickle
from time import sleep
class IPCache:
    """Disk-backed cache mapping an IP string to ((lat, lon), tzinfo).

    Use as a context manager: the cache is loaded from a pickle file on
    ``__enter__`` and written back on ``__exit__``.  Lookups that miss the
    cache query the ip-api.com geolocation service over HTTP.
    """
    picklefile_name = 'cached_ips.pkl'

    def __init__(self, /):
        # Remember where we were created so the pickle file always lives in
        # the same directory, even though the program chdir()s around later.
        self.creation_dir = Path('.').resolve()

    def get(self, ip, /):
        """Return ((lat, lon), datetime.timezone) for `ip`.

        Cache hits return immediately; misses query ip-api.com with
        exponential backoff.  Raises RuntimeError after 5 failed HTTP
        attempts, or if ip-api reports the address as invalid.
        """
        if ip in self.cache:
            return self.cache[ip]
        print(f'{ip} not in cache')
        addr = f'http://ip-api.com/json/{ip}'
        attempts = 1
        max_attempts = 5
        while attempts <= max_attempts:
            response = requests.get(addr)
            # Sleep even after a successful request: the free ip-api tier is
            # rate limited, so we throttle every call (backoff grows on retry).
            sleep(2 ** attempts)
            if not response.ok:
                print(f'request for {ip} failed with {response.status_code}')
                attempts += 1
                continue
            break
        else:
            raise RuntimeError(f'critical failure (> 5 retries)')
        # Parse the JSON body properly -- eval() on untrusted network bytes
        # was a code-injection hazard (and breaks on true/false/null).
        resulting_dict = response.json()
        if resulting_dict['status'] == 'fail':
            raise RuntimeError(f'ip was invalid')
        # the given timezone is like, 'Australia/Sydney'. we need to convert
        # to a datetime.timezone type
        timezone_str = resulting_dict['timezone']
        if timezone_str == 'Europe/Kyiv':
            # older tz databases only know the pre-2022 spelling
            timezone_str = 'Europe/Kiev'
        tzfile = tz.gettz(timezone_str)
        # datetime.utcnow() is deprecated; build the same naive UTC "now"
        # from an aware timestamp to ask the tzfile for its current offset.
        utc_now = datetime.now(timezone.utc).replace(tzinfo=None)
        as_timedelta = tzfile.utcoffset(utc_now)
        as_timezone_type = timezone(as_timedelta)
        self.cache[ip] = (
            (resulting_dict['lat'], resulting_dict['lon']),
            as_timezone_type,
        )
        return self.cache[ip]

    def __enter__(self, filename=picklefile_name, /):
        # Load (or initialise) the cache from the pickle beside creation_dir.
        chdir(self.creation_dir)
        if Path(filename).exists():
            with open(filename, 'rb') as f:
                cache = pickle.load(f)
        else:
            cache = {}
        self.filename = filename
        self.cache = cache
        return self

    def __exit__(self, err_type, err_value, traceback, /):
        # Persist the cache even when an exception is propagating.
        chdir(self.creation_dir)
        with open(self.filename, 'wb') as f:
            pickle.dump(self.cache, f)
def filter_logs(filename: str):
    """Yield (ip, timestamp, status, method, file) for each successful GET.

    Each log line has the shape: IP,YYMMDD hhmmss TIMEZONE,STATUS,METHOD,FILE.
    The first four components of the request path are dropped from the
    yielded filename; non-200 and non-GET entries are skipped.
    """
    with open(filename, 'r') as logfile:
        for entry in logfile:
            # split into at most 5 fields so commas inside the path survive
            ip, when, status, verb, rawpath = entry.split(',', 4)
            trimmed = '/'.join(rawpath.split('/')[4:]).strip()
            if status == '200' and verb == 'GET':
                yield ip, when, status, verb, trimmed
def latlon_distance(p1, p2) -> float:
    """Great-circle distance in km between two (lat, lon) degree pairs.

    Uses the haversine formula with a mean Earth radius of 6371 km.
    """
    (lat_a, lon_a), (lat_b, lon_b) = p1, p2
    phi_a, phi_b = radians(lat_a), radians(lat_b)
    dphi = radians(lat_b - lat_a)
    dlam = radians(lon_b - lon_a)
    # haversine term: sin^2(dphi/2) + cos(phi_a)cos(phi_b)sin^2(dlam/2)
    h = sin(dphi / 2) ** 2 + cos(phi_a) * cos(phi_b) * sin(dlam / 2) ** 2
    earth_radius_km = 6371
    return earth_radius_km * 2 * asin(sqrt(h))
def analyze_server(server: Path) -> str:
    """Build CSV rows for every successful GET logged under `server`.

    Reads `server`/access.log and returns one CSV line per hit with the
    columns: server, ip, lat, lon, distance-km, server date, server time,
    local date, local time, verb, filename.  Returns '' for paths that are
    not directories, so callers can write the result unconditionally
    (the old `return None` made main()'s f.write() blow up on stray files).
    """
    if not server.is_dir():
        return ''
    serverip = get_server_ip(server.name)
    result = ''
    filename = f'{server.name}/access.log'
    self_latlon, _ = ip_cache.get(serverip)
    for ip, timethings, status, method, file in filter_logs(filename):
        # latitude/longitude and tzinfo of the requester (disk-cached lookup).
        # Renamed from `timezone` -- that shadowed the datetime.timezone
        # import and made `timezone.utc` below work only by accident.
        latlon, requester_tz = ip_cache.get(ip)
        # convert `timethings` to a datetime object
        time_of_hit = datetime.strptime(timethings, log_date_format)
        # convert from its default timezone in GMT to the requester's timezone
        localtime = time_of_hit.replace(tzinfo=timezone.utc).astimezone(requester_tz)
        # back to strings for logging purposes, without the timezone suffix;
        # the logs use 2-digit years, hence the '20' prefixes below
        localdate, localtime = localtime.strftime(log_date_format).split(' ')[:2]
        localdate = '20' + localdate
        distance = latlon_distance(self_latlon, latlon)
        date, time = timethings.split(' ')[:2]
        lat, lon = latlon
        result += (
            f'{server.name},{ip},{lat},{lon},{distance:.5f},20{date},{time},'
            + f'{localdate},{localtime},{method},"{file}"\n'
        )
    return result
def get_server_ip(servername: str) -> str:
    """Map a known server name to its public IPv4 address.

    Raises ValueError for names not in the known-server table.
    """
    known_ips = {
        'nova': '184.73.25.153',
        'singapore': '18.139.108.77',
        'sydney': '54.206.216.118',
        'dublin': '54.194.92.137',
        'brazil': '18.228.245.48',
    }
    try:
        return known_ips[servername]
    except KeyError:
        raise ValueError(f'{servername} is not a known server') from None
def main(args: list) -> int:
    """Analyze every server subdirectory of args[1] into analysis.csv.

    Returns 0 on success, 1 when no log directory argument was supplied.
    analysis.csv is written in the current working directory.
    """
    if len(args) == 1:
        # no log dirs provided
        print('no logdir provided')
        return 1
    outfile = 'analysis.csv'
    start_dir = Path('.').resolve()
    logdir = Path(args[1]).resolve()
    try:
        with open(outfile, 'w') as f:
            # header must match the row layout analyze_server() emits --
            # the old header was missing the lat,lon columns
            f.write('server,ip,lat,lon,distance,sv date,sv time,'
                    'local date,local time,verb,filename\n')
            chdir(logdir)
            for subdir in Path('.').iterdir():
                csv_lines = analyze_server(subdir)
                if csv_lines:
                    # skip stray files / empty results instead of crashing
                    f.write(csv_lines)
    finally:
        # restore the working directory even if an analysis step raised
        chdir(start_dir)
    return 0
# strptime/strftime pattern for log timestamps, e.g. '23/01/31 14:05:09 +0000'
log_date_format = r'%y/%m/%d %H:%M:%S %z'
if __name__ == '__main__':
    # `ip_cache` is a module-level global read by analyze_server(); the
    # context manager loads the on-disk pickle cache on entry and saves it
    # back on exit, even if main() raises.
    with IPCache() as ip_cache:
        code = main(sys.argv)
    sys.exit(code)