sciencefair2023/analyze.py

154 lines
4.9 KiB
Python
Raw Normal View History

2022-12-28 23:31:24 -05:00
import sys
import requests
2022-12-30 14:14:29 -05:00
from math import radians, cos, sin, acos, asin, sqrt
from pathlib import Path
from os import chdir
2023-01-03 21:54:09 -05:00
from datetime import datetime, timezone
from dateutil import tz
import pickle
class IPCache:
    """Disk-backed cache of IP geolocation lookups.

    Maps an IP address string to ``((lat, lon), tz)``, where ``tz`` is a
    fixed-offset ``datetime.timezone``. Misses are resolved through
    ip-api.com and remembered, and the whole mapping is persisted with pickle
    so later runs can reuse it.

    Use it as a context manager: entering loads the pickle file (if it
    exists) from the directory that was current when the object was
    constructed; leaving writes the cache back to that same file, whether or
    not the body raised. The process working directory is never changed.
    """

    picklefile_name = 'cached_ips.pkl'
    # Seconds to wait for ip-api.com; without a timeout a stalled connection
    # would block the analysis forever.
    request_timeout = 10

    def __init__(self, /):
        # pathlib.Path stopped being a context manager in Python 3.13, so
        # resolve the current directory directly.
        self.creation_dir = Path('.').resolve()
        # Defaults so get() is usable even before __enter__ has run.
        self.filename = self.picklefile_name
        self.cache = {}

    def get(self, ip, /):
        """Return ``((lat, lon), tz)`` for *ip*, querying ip-api.com on a miss.

        Raises:
            RuntimeError: the HTTP request failed, the service reported the
                ip as invalid, or the reported timezone name is unknown.
        """
        if ip in self.cache:
            return self.cache[ip]

        addr = f'http://ip-api.com/json/{ip}'
        response = requests.get(addr, timeout=self.request_timeout)
        if not response.ok:
            raise RuntimeError(f'request for ip failed with {response.status_code}')

        # The body is JSON from a remote service: parse it as JSON. Never
        # eval() data received over the network.
        resulting_dict = response.json()
        if resulting_dict['status'] == 'fail':
            raise RuntimeError('ip was invalid')

        # the given timezone is like, 'Australia/Sydney'. we need to convert
        # to a datetime.timezone type
        timezone_str = resulting_dict['timezone']
        tzfile = tz.gettz(timezone_str)
        if tzfile is None:
            raise RuntimeError(f'unknown timezone {timezone_str!r} for ip')
        # Offset from UTC that is in effect in that zone right now.
        as_timedelta = datetime.now(tz=tzfile).utcoffset()
        as_timezone_type = timezone(as_timedelta)

        entry = ((resulting_dict['lat'], resulting_dict['lon']), as_timezone_type)
        self.cache[ip] = entry
        return entry

    def __enter__(self, filename=picklefile_name, /):
        """Load the cache from *filename* (relative to the creation dir)."""
        self.filename = filename
        picklefile = self.creation_dir / filename
        if picklefile.exists():
            # NOTE: unpickling can execute arbitrary code. Only point this at
            # cache files written by this program.
            with open(picklefile, 'rb') as f:
                self.cache = pickle.load(f)
        else:
            self.cache = {}
        return self

    def __exit__(self, err_type, err_value, traceback, /):
        """Write the cache back to disk; exceptions from the body propagate."""
        with open(self.creation_dir / self.filename, 'wb') as f:
            pickle.dump(self.cache, f)
def filter_logs(filename: str):
    """Yield the successful GET requests recorded in an access log.

    Each log line has the form ``IP,DATE TIME TIMEZONE,STATUS,METHOD,FILE``.
    Only lines whose status is ``'200'`` and whose method is ``'GET'`` are
    yielded, as ``(ip, timethings, status, method, file)`` tuples of strings.
    ``file`` is the logged path with its first four ``/``-separated
    components removed and surrounding whitespace stripped.

    Blank lines are skipped. Any other line with fewer than five
    comma-separated fields raises ``ValueError``.
    """
    with open(filename, 'r') as f:
        for line in f:
            # A blank line (e.g. a trailing newline at the end of the file)
            # carries no record and must not abort the whole run.
            if not line.strip():
                continue

            # maxsplit=4 keeps any commas inside the file path intact
            ip, timethings, status, method, filepath = line.split(',', 4)
            if status != '200' or method != 'GET':
                continue

            file = '/'.join(filepath.split('/')[4:]).strip()
            yield ip, timethings, status, method, file
2022-12-30 14:14:29 -05:00
def latlon_distance(p1, p2) -> float:
    """Return the great-circle distance in kilometres between two points.

    Args:
        p1: ``(latitude, longitude)`` of the first point, in degrees.
        p2: ``(latitude, longitude)`` of the second point, in degrees.

    Uses the haversine formula with an Earth radius of 6371 km.
    """
    lat1, lon1 = p1
    lat2, lon2 = p2
    lon1 = radians(lon1)
    lat1 = radians(lat1)
    lon2 = radians(lon2)
    lat2 = radians(lat2)
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    # Floating-point rounding can push `a` a hair outside [0, 1] for
    # (near-)antipodal points, which would make sqrt/asin raise a domain
    # error; clamp it back into range.
    a = min(1.0, max(0.0, a))
    c = 2 * asin(sqrt(a))
    earth_radius_km = 6371
    return c * earth_radius_km
2023-01-03 22:14:28 -05:00
def analyze_server(server: Path) -> str:
    """Build the CSV rows for one server's access log.

    *server* is a directory named after a known server (see
    ``get_server_ip``) that contains an ``access.log`` file. Every successful
    GET request in that log becomes one row::

        server,ip,distance_km,date,time,local_date local_time,method,"file"

    where ``distance_km`` is the distance between the server and the
    requester, and the local date/time is the hit time expressed in the
    requester's timezone.

    Relies on the module-level ``ip_cache`` for geolocation lookups.

    Returns:
        The rows joined into one string (each terminated by a newline), or
        ``''`` when *server* is not a directory.
    """
    if not server.is_dir():
        # An empty string, not None, so callers can write the result as-is.
        return ''

    serverip = get_server_ip(server.name)
    filename = str(server / 'access.log')
    self_latlon, _ = ip_cache.get(serverip)

    rows = []
    for ip, timethings, _status, method, file in filter_logs(filename):
        # ip_cache.get() returns ((lat, lon), tz) for an ip. The timezone is
        # bound to `requester_tz` so it does not shadow datetime.timezone.
        latlon, requester_tz = ip_cache.get(ip)

        # convert `timethings` to a datetime object
        time_of_hit = datetime.strptime(timethings, log_date_format)
        # The log time is treated as GMT regardless of the offset parsed
        # from the line, then converted to the requester's timezone.
        localtime = time_of_hit.replace(tzinfo=timezone.utc).astimezone(requester_tz)
        # convert this time back to a string for logging purposes, without
        # the trailing timezone field
        localtime_str = ' '.join(localtime.strftime(log_date_format).split(' ')[:2])

        distance = latlon_distance(self_latlon, latlon)
        date, time = timethings.split(' ')[:2]
        rows.append(
            f'{server.name},{ip},{distance:.5f},{date},{time},'
            + f'{localtime_str},{method},"{file}"\n'
        )

    # join once instead of growing a string inside the loop
    return ''.join(rows)
def get_server_ip(servername: str) -> str:
    """Return the IP address associated with the server called *servername*.

    Raises:
        ValueError: *servername* is not one of the known servers.
    """
    # associate servers with ips
    known_servers = {
        'nova': '184.73.25.153',
        'singapore': '18.139.108.77',
        'sydney': '54.206.216.118',
        'dublin': '54.194.92.137',
        'brazil': '18.228.245.48',
    }
    try:
        return known_servers[servername]
    except (KeyError, TypeError):
        # TypeError covers unhashable arguments, which simply match nothing
        raise ValueError(f'{servername} is not a known server') from None
def main(args: list) -> int:
    """Analyze every server directory inside the given log directories.

    Args:
        args: ``sys.argv``-style list; ``args[0]`` is ignored and each
            following item is a directory whose sub-directories are handed
            to ``analyze_server``.

    The collected rows are written to ``analysis.csv`` in the directory that
    is current when the function is called.

    Returns:
        ``0`` on success, ``1`` when no log directory was given.
    """
    if len(args) < 2:
        # no log dirs provided
        print('no logdir provided')
        return 1

    outfile = 'analysis.csv'
    start_dir = Path('.').resolve()

    with open(outfile, 'w') as f:
        for logdir in args[1:]:
            # analyze_server resolves paths relative to the log directory
            chdir(logdir)
            try:
                # sorted so the order of rows does not depend on the
                # filesystem's listing order
                for subdir in sorted(Path('.').iterdir()):
                    csv_lines = analyze_server(subdir)
                    # entries that are not server directories yield nothing
                    if csv_lines:
                        f.write(csv_lines)
            finally:
                # always return to where we started, even if analysis
                # failed, so later relative paths still resolve correctly
                chdir(start_dir)
    return 0
2023-01-03 21:54:09 -05:00
# strptime/strftime pattern for log timestamps: two-digit year/month/day,
# 24-hour time, then a UTC offset such as +0000
log_date_format = r'%y/%m/%d %H:%M:%S %z'

if __name__ == '__main__':
    # `ip_cache` has to be a module-level name: analyze_server() reads it.
    # The cache is written back to disk when the with-block is left.
    with IPCache() as ip_cache:
        exit_code = main(sys.argv)
    sys.exit(exit_code)