173 lines
5.5 KiB
Python
173 lines
5.5 KiB
Python
import sys
|
|
import requests
|
|
from math import radians, cos, sin, acos, asin, sqrt
|
|
from pathlib import Path
|
|
from os import chdir
|
|
from datetime import datetime, timezone
|
|
from dateutil import tz
|
|
import pickle
|
|
from time import sleep
|
|
|
|
class IPCache:
    """Disk-backed cache of IP geolocation lookups.

    Intended to be used as a context manager: entering loads previously
    cached results from a pickle file located in the directory that was
    current when the instance was created, and exiting writes the cache
    back to that file. Lookups that miss the cache are resolved through
    the ip-api.com JSON endpoint.
    """

    picklefile_name = 'cached_ips.pkl'

    def __init__(self, /):
        # Path objects are not context managers (that support was removed
        # in Python 3.13), so resolve the working directory directly.
        self.creation_dir = Path('.').resolve()

    def get(self, ip, /):
        """Return ``((lat, lon), tz)`` for *ip*.

        A cached entry is returned as-is. Otherwise the address is looked
        up remotely, the answer is stored in the cache, and then returned.
        ``tz`` is a fixed-offset ``datetime.timezone`` built from the UTC
        offset the reported zone has at lookup time.

        Raises:
            RuntimeError: if every attempt returns a non-OK HTTP status,
                if the service reports the lookup as failed, or if the
                reported timezone name cannot be resolved.
        """
        if ip in self.cache:
            return self.cache[ip]

        print(f'{ip} not in cache')
        addr = f'http://ip-api.com/json/{ip}'
        attempts = 1
        max_attempts = 5
        while attempts <= max_attempts:
            response = requests.get(addr)
            # Sleeps after every request, successful or not, with a delay
            # that doubles on each failed attempt.
            # NOTE(review): the pause after a successful request looks like
            # deliberate rate limiting -- confirm before removing it.
            sleep(2 ** attempts)
            if response.ok:
                break
            print(f'request for {ip} failed with {response.status_code}')
            attempts += 1
        else:
            raise RuntimeError('critical failure (> 5 retries)')

        # SECURITY: the body comes from a remote service, so it is parsed
        # strictly as JSON. It used to be handed to eval(), which would
        # execute whatever the server sent and also fails on the JSON
        # literals true/false/null.
        resulting_dict = response.json()
        if resulting_dict['status'] == 'fail':
            raise RuntimeError('ip was invalid')

        # the given timezone is like, 'Australia/Sydney'. we need to convert
        # to a datetime.timezone type
        timezone_str = resulting_dict['timezone']
        if timezone_str == 'Europe/Kyiv':
            # presumably the local tz database only knows the older
            # spelling -- verify against the installed zoneinfo
            timezone_str = 'Europe/Kiev'
        tzfile = tz.gettz(timezone_str)
        if tzfile is None:
            # gettz() returns None for names it cannot resolve
            raise RuntimeError(f'unknown timezone {timezone_str!r} for {ip}')

        # Naive "now in UTC", equivalent to the deprecated datetime.utcnow().
        now_utc_naive = datetime.now(timezone.utc).replace(tzinfo=None)
        as_timedelta = tzfile.utcoffset(now_utc_naive)
        as_timezone_type = timezone(as_timedelta)

        self.cache[ip] = (
            (resulting_dict['lat'], resulting_dict['lon']),
            as_timezone_type,
        )
        return self.cache[ip]

    def __enter__(self, filename=picklefile_name, /):
        """Load the cache from *filename* (or start empty) and return self.

        Changes the process working directory to the directory that was
        current when the instance was created.
        """
        chdir(self.creation_dir)
        if Path(filename).exists():
            # NOTE: unpickling runs code embedded in the file; only point
            # this at cache files written by this program.
            with open(filename, 'rb') as f:
                cache = pickle.load(f)
        else:
            cache = {}
        self.filename = filename
        self.cache = cache
        return self

    def __exit__(self, err_type, err_value, traceback, /):
        """Write the cache back to disk; exceptions are not suppressed."""
        chdir(self.creation_dir)
        with open(self.filename, 'wb') as f:
            pickle.dump(self.cache, f)
|
|
|
|
def filter_logs(filename: str):
    """Yield the successful GET requests recorded in an access log.

    Every non-blank line must have the shape
    ``IP,YYMMDD hhmmss TIMEZONE,STATUS,METHOD,FILE``. Only lines whose
    status is exactly ``'200'`` and whose method is exactly ``'GET'`` are
    yielded.

    Args:
        filename: path of the log file to read.

    Yields:
        ``(ip, timethings, status, method, file)`` tuples of strings, where
        ``file`` is the logged path with its first four ``/``-separated
        components removed and surrounding whitespace stripped.

    Raises:
        ValueError: if a non-blank line has fewer than five fields.
    """
    with open(filename, 'r') as f:
        for line in f:
            # Blank lines (e.g. a trailing newline at the end of the log)
            # carry no record; skip them instead of failing the unpack.
            if not line.strip():
                continue
            # IP,YYMMDD hhmmss TIMEZONE,STATUS,METHOD,FILE
            # maxsplit keeps any commas inside the file path intact
            ip, timethings, status, method, filepath = line.split(',', 4)
            if status != '200' or method != 'GET':
                continue
            file = '/'.join(filepath.split('/')[4:]).strip()
            yield ip, timethings, status, method, file
|
|
|
|
def latlon_distance(p1, p2) -> float:
    """Return the great-circle distance in kilometres between two points.

    Uses the haversine formula with an Earth radius of 6371 km.

    Args:
        p1: ``(latitude, longitude)`` pair in degrees.
        p2: ``(latitude, longitude)`` pair in degrees.
    """
    earth_radius_km = 6371

    # work in radians from here on
    phi1, lam1 = (radians(degrees) for degrees in p1)
    phi2, lam2 = (radians(degrees) for degrees in p2)

    delta_lam = lam2 - lam1
    delta_phi = phi2 - phi1

    # haversine of the central angle between the two points
    hav = sin(delta_phi / 2)**2 + cos(phi1) * cos(phi2) * sin(delta_lam / 2)**2

    # central angle times the radius gives the arc length
    return 2 * asin(sqrt(hav)) * earth_radius_km
|
|
|
|
def analyze_server(server: Path) -> str:
    """Build the CSV rows for one server's access log.

    Reads ``<server>/access.log`` through ``filter_logs`` and, for every
    yielded hit, looks the client up in the module-level ``ip_cache`` to
    get its coordinates and UTC offset.

    Args:
        server: directory named after a known server. Anything that is not
            a directory is ignored.

    Returns:
        The rows as one string, each row terminated by a newline, with the
        fields: server, ip, lat, lon, distance (km, 5 decimals), server
        date, server time, client-local date, client-local time, method
        and the quoted file path. An empty string is returned when
        *server* is not a directory or the log has no matching hits.
    """
    if not server.is_dir():
        # Return an empty string rather than None so the caller can always
        # write the result to its output file.
        return ''
    serverip = get_server_ip(server.name)

    filename = str(server / 'access.log')
    self_latlon, _ = ip_cache.get(serverip)

    rows = []
    for ip, timethings, status, method, file in filter_logs(filename):
        # Named client_tz so the imported datetime.timezone class is not
        # shadowed inside this function.
        latlon, client_tz = ip_cache.get(ip)

        # convert `timethings` to a datetime object
        time_of_hit = datetime.strptime(timethings, log_date_format)
        # The timestamp is treated as UTC regardless of the offset that was
        # parsed from the log, then shifted to the requester's offset.
        local_dt = time_of_hit.replace(tzinfo=timezone.utc).astimezone(client_tz)
        # back to strings for the CSV, dropping the trailing UTC offset
        localdate, localtime = local_dt.strftime(log_date_format).split(' ')[:2]
        # the log format has a two-digit year; emit four digits
        localdate = '20' + localdate

        distance = latlon_distance(self_latlon, latlon)
        date, time = timethings.split(' ')[:2]
        lat, lon = latlon
        rows.append(
            f'{server.name},{ip},{lat},{lon},{distance:.5f},20{date},{time},'
            f'{localdate},{localtime},{method},"{file}"\n'
        )

    return ''.join(rows)
|
|
|
|
def get_server_ip(servername: str) -> str:
    """Return the IP address associated with a known server name.

    Raises:
        ValueError: if *servername* is not one of the known servers.
    """
    # server name -> ip, checked in order
    known_servers = (
        ('nova', '184.73.25.153'),
        ('singapore', '18.139.108.77'),
        ('sydney', '54.206.216.118'),
        ('dublin', '54.194.92.137'),
        ('brazil', '18.228.245.48'),
    )
    for name, address in known_servers:
        if servername == name:
            return address

    raise ValueError(f'{servername} is not a known server')
|
|
|
|
def main(args: list) -> int:
    """Analyze every server directory under a log directory.

    Writes ``analysis.csv`` in the current working directory: a header
    line followed by whatever ``analyze_server`` returns for each entry of
    the log directory, visited in sorted order.

    Args:
        args: command-line style argument list; ``args[1]`` is the log
            directory.

    Returns:
        0 on success, 1 if no log directory was provided.
    """
    if len(args) < 2:
        # no log dirs provided (also covers an empty argument list)
        print('no logdir provided')
        return 1

    outfile = 'analysis.csv'
    start_dir = Path('.').resolve()
    # Path objects are not context managers (that support was removed in
    # Python 3.13); resolve the directory directly.
    logdir = Path(args[1]).resolve()

    with open(outfile, 'w') as f:
        # one column per field of the rows produced by analyze_server
        f.write(
            'server,ip,lat,lon,distance,sv date,sv time,'
            'local date,local time,verb,filename\n'
        )
        # the per-server log paths are resolved relative to the log dir
        chdir(logdir)
        try:
            for subdir in sorted(Path('.').iterdir()):
                csv_lines = analyze_server(subdir)
                # entries that produce nothing (e.g. stray files) are skipped
                if csv_lines:
                    f.write(csv_lines)
        finally:
            # restore the working directory even if analysis fails
            chdir(start_dir)

    return 0
|
|
|
|
# strptime/strftime layout of the timestamp field in the access logs:
# two-digit year, month, day, 24-hour time, then the UTC offset
log_date_format = r'%y/%m/%d %H:%M:%S %z'


if __name__ == '__main__':
    # ip_cache has to be a module-level name: analyze_server() reads it as
    # a global. Leaving the with-block writes the cache back to disk before
    # the process exits.
    with IPCache() as ip_cache:
        exit_status = main(sys.argv)
    sys.exit(exit_status)