project/analysis/log_analyzer.py
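"""Ad-hoc driver script for the log analysis experiments.

Each `if False:` block in the main section below is a separately toggled
experiment; typically one block at a time is flipped to `if True:` for a run.
"""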


import json
import logging
from typing import List
from analysis import analyzers
from analysis.analyzers import get_renderer, render
from analysis.analyzers.analyzer import ResultStore
from analysis.analyzers.analyzer.default import write_logentry_count_csv, write_simulation_flag_csv
from analysis.analyzers.render import wip
from analysis.analyzers.render.default import LogEntryCountCSV, KMLRender, GeoJSON
from analysis.analyzers.render.wip import time_distribution, plot_data
from analysis.analyzers.settings import LogSettings, load_settings, parse_settings
from analysis.loaders import LOADERS
from analysis.util.processing import grep, run_analysis, src_file
from analysis.util.meta_temp import CONFIG_NAMES
logging.basicConfig(format='%(levelname)s %(name)s:%(message)s', level=logging.DEBUG)
log: logging.Logger = logging.getLogger(__name__)
logging.getLogger("requests").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)
logging.getLogger("shapely").setLevel(logging.WARNING)

def urach_logs(log_ids, settings):
    """Build data file paths for the given log ids using the configured log format."""
    # return ["data/inst_{id}.{format}".format(id=log_id, format=settings.log_format) for log_id in log_ids]
    return ["data/{id}.{format}".format(id=log_id, format=settings.log_format) for log_id in log_ids]
if __name__ == '__main__':
    settings = {}
    log_ids_gf = []
    # settings: LogSettings = load_settings("biogames2.json")
    # log_ids_urach: List[str] = urach_logs([
    #     # "34fecf49dbaca3401d745fb467",
    #     # "44ea194de594cd8d63ac0314be",
    #     # "57c444470dbf88605433ca935c",
    #     # "78e0c545b594e82edfad55bd7f",
    #     # "91abfd4b31a5562b1c66be37d9",
    #     # "597b704fe9ace475316c345903",
    #     # "e01a684aa29dff9ddd9705edf8",
    #     "597b704fe9ace475316c345903",
    #     "e01a684aa29dff9ddd9705edf8",
    #     "fbf9d64ae0bdad0de7efa3eec6",
    #     # "fbf9d64ae0bdad0de7efa3eec6",
    #     "fe1331481f85560681f86827ec",  # urach
    #     # "fe1331481f85560681f86827ec"]
    #     "fec57041458e6cef98652df625",
    # ]
    # , settings)
    # log_ids_gf = grep(["9d11b749c78a57e786bf5c8d28",  # filderstadt
    #                    "a192ff420b8bdd899fd28573e2",  # eichstätt
    #                    "3a3d994c04b1b1d87168422309",  # stadtökologie
    #                    "fe1331481f85560681f86827ec",  # urach
    #                    "96f6d9cc556b42f3b2fec0a2cb7ed36e"  # oberelsbach
    #                    ],
    #                   "/home/clemens/git/ma/test/src",
    #                   settings)
    # log_ids = src_file("/home/clemens/git/ma/test/filtered_5_actions")
    if False:
        store: ResultStore = run_analysis(log_ids_gf, settings, LOADERS)
        # store: ResultStore = run_analysis(log_ids, settings, LOADERS)
    if False:
        for r in get_renderer(analyzers.LocomotionActionAnalyzer):
            r().render(store.get_all())
    if False:
        render(analyzers.LocationAnalyzer, store.get_all())
        # print(json.dumps(store.serializable(), indent=1))
    if False:
        for cat in store.get_categories():
            render(analyzers.ActivityMapper, store.get_category(cat), name=cat)
        # render(analyzers.ProgressAnalyzer, store.get_all())
    if False:
        from analysis.analyzers.postprocessing import graph
        g = graph.Cache(settings)
        g.run(store)
    if False:
        # render(analyzers.SimulationOrderAnalyzer, store.get_all())
        for cat in store.get_categories():
            data = store.get_category(cat)
            render(analyzers.SimulationOrderAnalyzer, data, name=cat)
    if False:
        write_logentry_count_csv(LogEntryCountCSV, store, render, analyzers)
    if False:
        write_simulation_flag_csv(store)
    if False:
        time_distribution(store)
    if False:
        # spatial_data = get_data_distance(store, relative_values=False)
        # temporal_data = get_data(store, relative_values=False)
        # spatial_data_rel = get_data_distance(store, relative_values=True)
        # temporal_data_rel = get_data(store, relative_values=True)
        # temporal_data_rel = json.load(open("temporal_rel.json"))
        # spatial_data_rel = json.load(open("spatial_rel.json"))
        # import IPython
        # IPython.embed()
        # print(json.dumps(get_all_data(store)))
        # json.dump(get_all_data(store), open("combined.json", "w"))
        # combined = get_all_data(store, sort=True, relative=True)
        # json.dump(combined, open("combined_rel.json", "w"))
        # combined = json.load(open("combined_rel.json"))
        combined = json.load(open("combined_total.json"))
        # plot_time_space_rel(combined, keys)
        plot_data(combined, wip.keys)
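        # Note: "combined_total.json" must already exist on disk; the commented lines
        # above show how the combined data files were produced and cached on earlier runs.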
    if False:
        def store(x):  # placeholder; shadowed by the real ResultStore assigned below
            pass
        settings: LogSettings = load_settings("../oeb_kml.json")
        # log_ids = src_file("/app/log_data/oeb/oeb_2016_path")
        log_ids = src_file("/app/log_data/oeb/oeb_paths")
        # log_ids = log_ids[0:10]
        print(log_ids)
        store: ResultStore = run_analysis(log_ids, settings, LOADERS, ResultStore(key_index=1))
        print("render")
        kml = GeoJSON()  # despite the name, this renders GeoJSON; KMLRender is the imported alternative
        fields = store.get_categories()
        artifacts = {key: kml.render(store.get_category(key)) for key in fields}
        print(artifacts)
        print("done")
        def artifact_to_length(filename):
            """Total length of the first feature's track in a GeoJSON artifact."""
            g = json.load(open(filename))
            from analysis.util.geo import calc_distance
            return calc_distance(json.dumps(g), "features.0.geometry.coordinates")

        def simplified_length(filename):
            """Length after Shapely simplification; also writes the simplified track as GeoJSON."""
            from analysis.util.geo import json_to_track, distance
            g = json.load(open(filename))
            track = json_to_track(json.dumps(g), "features.0.geometry.coordinates")
            simplified = track.simplify(0.0002, preserve_topology=True)
            from shapely.geometry import mapping
            json.dump(mapping(simplified), open(f"{filename}.simplified.geojson", "w"), indent=1)
            return distance(simplified)
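
        # e.g. for a (hypothetical) artifact "cat_0.geojson", simplified_length() also
        # writes "cat_0.geojson.simplified.geojson" next to it before returning the length.
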
        from collections import defaultdict

        def get_lengths(artifacts, artifact_to_length=artifact_to_length):
            """Collect per-category track lengths, flagging suspiciously long or short tracks."""
            stats = defaultdict(list)
            for field in artifacts:
                print(field, CONFIG_NAMES[field])
                for i in artifacts[field]:
                    distance = artifact_to_length(i)
                    warn = "\tLONG!" if distance > 10000 else "\tSHORT!" if distance < 1000 else ""
                    print(f"\t{i}\t{distance}{warn}")
                    stats[field].append(distance)
            return stats
        stats = get_lengths(artifacts)
        import numpy as np

        def quart_1(x):
            return np.percentile(x, 25)

        def quart_2(x):
            return np.percentile(x, 50)

        def quart_3(x):
            return np.percentile(x, 75)

        def quart_4(x):
            return np.percentile(x, 100)
        def print_stats(stats):
            fns = [np.size, np.min, np.max, np.mean, np.median]  # , quart_1, quart_2, quart_3, quart_4
            names = "\t".join([x.__name__ for x in fns] + ["id", "name"])
            print(names)
            for i in stats:
                stat = [f"{fn(stats[i]):.2f}" for fn in fns]
                print("\t".join(stat + [i, CONFIG_NAMES[i]]))
        def plot_stats(stats, filtered_stats, suffix=""):
            import matplotlib.pyplot as plt
            keys = sorted(stats.keys())
            names = [CONFIG_NAMES[i] for i in keys]
            values = [stats[i] for i in keys]
            values_filtered = [filtered_stats[i] for i in keys]
            fig, ax = plt.subplots()
            ax.boxplot(values, labels=names, showfliers=False, showmeans=True, meanline=True)
            fig.savefig(f"/app/log_data/oeb/plots/plot_raw{suffix}.png")
            fig, ax = plt.subplots()
            ax.boxplot(values_filtered, labels=names, showfliers=False, showmeans=True, meanline=True)
            fig.savefig(f"/app/log_data/oeb/plots/plot_filtered{suffix}.png")
            fig, ax = plt.subplots()
            agg_data = values + values_filtered
            agg_labels = names + [f"filtered(…{i[-4:]})" for i in names]
            ax.boxplot(agg_data, labels=agg_labels, showfliers=False, showmeans=True, meanline=True)
            fig.savefig(f"/app/log_data/oeb/plots/plot_combined{suffix}.png")
        MIN = 1000
        MAX = 100000

        def filter_lengths(stats):
            """Drop implausible track lengths outside the (MIN, MAX) range."""
            stats_filtered = defaultdict(list)
            for i in stats:
                stats_filtered[i] = [x for x in stats[i] if MIN < x < MAX]
            return stats_filtered

        stats_filtered = filter_lengths(stats)
        stats_simple = get_lengths(artifacts, artifact_to_length=simplified_length)
        stats_filtered_simple = filter_lengths(stats_simple)

        def summary(stats, stats_filtered, title):
            print_stats(stats)
            print(f"filter {MIN} < x < {MAX}")
            print_stats(stats_filtered)
            plot_stats(stats, stats_filtered, suffix=f"_{title}")

        summary(stats, stats_filtered, "raw")
        print("\nsimplified\n")
        summary(stats_simple, stats_filtered_simple, "simplified")
    if True:
        settings: LogSettings = load_settings("time.json")
        # log_ids = src_file("/app/log_data/oeb/oeb_2016_path")
        # log_ids = src_file("log_data/oeb/oeb_paths_host")
        # log_ids = src_file("/home/clemens/git/ma/test/src")
        log_ids = src_file("/home/clemens/git/ma/project/log_data/neocartographer/index")
        # log_ids = log_ids[0:10]
        print(log_ids)
        store: ResultStore = run_analysis(log_ids, settings, LOADERS, ResultStore(key_index=1))
        results = {}
        for cat in store.get_categories():
            results[cat] = [result.get() for result in store.get_category(cat)]
        with open("times_neo.json", "w") as out:
            json.dump(results, out, indent=1)
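        # times_neo.json ends up mapping each category key to the list of raw
        # analyzer results collected for that category.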
####################
# for cat in store.get_categories():
#     render(analyzers.ActivityMapper, store.get_category(cat), name=cat)
# for analyzer in analyzers:
#     if analyzer.name() in ["LogEntryCount", "ActionSequenceAnalyzer"]:
#         print(json.dumps(analyzer.result(), indent=2))
# for analyzer in analyzers:
#     if analyzer.name() in ["BoardDuration"]:
#         print(json.dumps(analyzer.result(), indent=2))
#         print(analyzer.render())
# coords = analyzers[1].render()
# with open("test.js", "w") as out:
#     out.write("coords = " + coords)