Skip to content

Commit 74e478e

Browse files
committed
fix export to CSV for time series, collocation, and aggregation
1 parent 009cb50 commit 74e478e

9 files changed

Lines changed: 187 additions & 79 deletions

File tree

python/philologic/runtime/__init__.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,21 +7,27 @@
77
from philologic.runtime.pages import page_interval
88
from philologic.runtime.Query import parse_query
99
from philologic.runtime.reports import (
10+
aggregation_by_field,
11+
aggregation_to_csv,
1012
bibliography_results,
13+
bibliography_to_csv,
1114
collocation_results,
15+
collocation_to_csv,
1216
concordance_results,
17+
concordance_to_csv,
1318
frequency_results,
1419
generate_text_object,
1520
generate_time_series,
1621
generate_toc_object,
1722
generate_word_frequency,
1823
get_start_end_date,
19-
kwic_hit_object,
20-
kwic_results,
2124
group_by_metadata,
2225
group_by_range,
26+
kwic_hit_object,
27+
kwic_results,
28+
kwic_to_csv,
2329
landing_page_bibliography,
24-
aggregation_by_field,
30+
time_series_to_csv,
2531
)
2632
from philologic.runtime.web_config import WebConfig
2733
from philologic.runtime.WSGIHandler import WSGIHandler
Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
"""Report exports"""
22

3-
from philologic.runtime.reports.concordance import concordance_results
4-
from philologic.runtime.reports.bibliography import bibliography_results
5-
from philologic.runtime.reports.time_series import generate_time_series, get_start_end_date
3+
from philologic.runtime.reports.concordance import concordance_results, concordance_to_csv
4+
from philologic.runtime.reports.bibliography import bibliography_results, bibliography_to_csv
5+
from philologic.runtime.reports.time_series import generate_time_series, get_start_end_date, time_series_to_csv
66
from philologic.runtime.reports.navigation import generate_text_object
77
from philologic.runtime.reports.table_of_contents import generate_toc_object
8-
from philologic.runtime.reports.kwic import kwic_results, kwic_hit_object
8+
from philologic.runtime.reports.kwic import kwic_results, kwic_hit_object, kwic_to_csv
99
from philologic.runtime.reports.generate_word_frequency import generate_word_frequency
1010
from philologic.runtime.reports.frequency import frequency_results
11-
from philologic.runtime.reports.collocation import collocation_results
11+
from philologic.runtime.reports.collocation import collocation_results, collocation_to_csv
1212
from philologic.runtime.reports.landing_page import landing_page_bibliography, group_by_range, group_by_metadata
13-
from philologic.runtime.reports.aggregation import aggregation_by_field
13+
from philologic.runtime.reports.aggregation import aggregation_by_field, aggregation_to_csv

python/philologic/runtime/reports/aggregation.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
#!/usr/bin/env python3
22
"""Report designed to group results by metadata with additional breakdown optional"""
33

4+
import csv
5+
import io
6+
47
import numpy as np
58

69
from philologic.runtime.DB import DB
@@ -137,6 +140,45 @@ def aggregation_by_field(request, config):
137140
}
138141

139142

143+
def aggregation_to_csv(results, break_up_field_name=""):
    """Convert aggregation results to a CSV string.

    Each breakdown entry gets its own row. Rows from the same group are
    contiguous, with the group-level metadata (and the group's count, in the
    ``group_count`` column) repeated on every row of that group.

    Args:
        results: sequence of group dicts. Each group is read for
            ``"metadata_fields"`` (dict), ``"count"`` and, when a breakdown
            is requested, ``"break_up_field"`` (a list of dicts that carry
            their own ``"metadata_fields"`` and ``"count"``).
        break_up_field_name: name of the breakdown field. When empty, the
            breakdown entries are ignored and one row per group is written.

    Returns:
        The CSV text including a header row, or ``""`` when ``results`` is
        empty.
    """
    if not results:
        return ""
    # These two keys are bookkeeping, not user-facing metadata columns.
    hidden_keys = ("field_name", "philo_id")

    # Build the group columns from every result rather than only the first
    # one, so metadata that appears only on later groups is not dropped.
    group_keys = sorted({key for result in results for key in result["metadata_fields"] if key not in hidden_keys})

    has_breakdown = bool(break_up_field_name) and any(result.get("break_up_field") for result in results)
    breakdown_keys = []
    if has_breakdown:
        # Collect all metadata keys from breakdown entries; keys already
        # shown at group level are not repeated as breakdown columns.
        seen_breakdown_keys = set()
        for result in results:
            for sub in result.get("break_up_field") or []:
                seen_breakdown_keys.update(key for key in sub["metadata_fields"] if key not in hidden_keys)
        breakdown_keys = sorted(seen_breakdown_keys.difference(group_keys))
        fieldnames = group_keys + ["group_count"] + breakdown_keys + ["count"]
    else:
        fieldnames = group_keys + ["count"]

    output = io.StringIO()
    writer = csv.DictWriter(output, fieldnames=fieldnames)
    writer.writeheader()
    for result in results:
        group_fields = {key: result["metadata_fields"].get(key, "") for key in group_keys}
        if has_breakdown:
            # group_count is group-level data: fill it on every row of a
            # breakdown export, including groups with no breakdown entries.
            group_fields["group_count"] = result["count"]
        sub_results = (result.get("break_up_field") or []) if has_breakdown else []
        if sub_results:
            for sub in sub_results:
                row = dict(group_fields)
                for key in breakdown_keys:
                    row[key] = sub["metadata_fields"].get(key, "")
                row["count"] = sub["count"]
                writer.writerow(row)
        else:
            # No breakdown rows for this group: emit one row with its total.
            writer.writerow({**group_fields, "count": result["count"]})
    return output.getvalue()
180+
181+
140182
def __expand_hits_counted(hits, metadata_type):
141183
"""Stream sorted hitlist with numpy, return per-ID hit counts.
142184

python/philologic/runtime/reports/bibliography.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
#!/var/lib/philologic5/philologic_env/bin/python3
22
"""Bibliography results"""
33

4+
import csv
5+
import io
46

57
from philologic.runtime.citations import citation_links, citations
68
from philologic.runtime.DB import DB
@@ -77,3 +79,19 @@ def bibliography_results(request, config):
7779
bibliography_object["query_done"] = hits.done
7880
bibliography_object["result_type"] = result_type
7981
return bibliography_object, hits
82+
83+
84+
def bibliography_to_csv(results):
    """Convert bibliography results to a CSV string.

    Args:
        results: sequence of result dicts, each read for ``"philo_id"`` (an
            iterable whose items are joined with spaces) and
            ``"metadata_fields"`` (dict of column name -> value).

    Returns:
        The CSV text including a header row, or ``""`` when ``results`` is
        empty. Columns are ``philo_id`` followed by the sorted metadata keys.
    """
    if not results:
        return ""
    # Use the union of metadata keys over all results. Deriving the header
    # from the first result only makes DictWriter raise ValueError as soon as
    # a later result carries a key the first one lacks.
    metadata_keys = sorted({key for result in results for key in result["metadata_fields"]})
    output = io.StringIO()
    # restval fills the cell for results that lack one of the columns.
    writer = csv.DictWriter(output, fieldnames=["philo_id"] + metadata_keys, restval="")
    writer.writeheader()
    for result in results:
        row = {"philo_id": " ".join(str(x) for x in result["philo_id"])}
        row.update(result["metadata_fields"])
        writer.writerow(row)
    return output.getvalue()

python/philologic/runtime/reports/collocation.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -580,6 +580,21 @@ def collocation_results(request, config):
580580
return collocation_object
581581

582582

583+
def collocation_to_csv(collocates):
    """Serialize collocation results to CSV text.

    ``collocates`` is iterated as ``(word, count)`` pairs. The output starts
    with a ``collocate,count`` header row followed by one row per pair; a
    falsy ``collocates`` yields the empty string.
    """
    import csv
    import io

    if not collocates:
        return ""
    buffer = io.StringIO()
    sheet = csv.writer(buffer)
    sheet.writerow(("collocate", "count"))
    # Unpack each item explicitly so anything that is not a pair still fails.
    sheet.writerows((word, count) for word, count in collocates)
    return buffer.getvalue()
596+
597+
583598
def atomic_pickle_dump(data, file_path):
584599
"""Write pickle atomically to prevent truncated reads from concurrent requests."""
585600
dir_path = os.path.dirname(file_path)

python/philologic/runtime/reports/concordance.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
#!/var/lib/philologic5/philologic_env/bin/python3
22
"""Concordance report"""
33

4+
import csv
5+
import io
6+
47
import regex as re
58
from philologic.runtime.citations import citation_links, citations
69
from philologic.runtime.DB import DB
@@ -60,3 +63,23 @@ def concordance_results(request, config):
6063
concordance_object["results_length"] = len(hits)
6164
concordance_object["query_done"] = hits.done
6265
return concordance_object
66+
67+
68+
def concordance_to_csv(results, filter_html=False):
    """Convert concordance results to a CSV string.

    Args:
        results: sequence of result dicts, each read for ``"philo_id"`` (an
            iterable whose items are joined with spaces), ``"context"``
            (string) and ``"metadata_fields"`` (dict of column name -> value).
        filter_html: when true, remove every ``<...>`` tag from the context
            and strip surrounding whitespace before writing it.

    Returns:
        The CSV text including a header row, or ``""`` when ``results`` is
        empty. Columns are ``philo_id``, ``context``, then the sorted
        metadata keys.
    """
    if not results:
        return ""
    tags_re = re.compile(r"<[^>]+>")
    # Use the union of metadata keys over all results. Deriving the header
    # from the first result only makes DictWriter raise ValueError as soon as
    # a later result carries a key the first one lacks.
    metadata_keys = sorted({key for result in results for key in result["metadata_fields"]})
    output = io.StringIO()
    # restval fills the cell for results that lack one of the columns.
    writer = csv.DictWriter(output, fieldnames=["philo_id", "context"] + metadata_keys, restval="")
    writer.writeheader()
    for result in results:
        context = result["context"]
        if filter_html:
            context = tags_re.sub("", context).strip()
        row = {"philo_id": " ".join(str(x) for x in result["philo_id"]), "context": context}
        row.update(result["metadata_fields"])
        writer.writerow(row)
    return output.getvalue()

python/philologic/runtime/reports/kwic.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
#!/var/lib/philologic5/philologic_env/bin/python3
22
"""KWIC results"""
33

4+
import csv
5+
import io
46

57
import regex as re
68
from philologic.runtime.citations import citation_links, citations
@@ -92,3 +94,23 @@ def kwic_hit_object(hit, config, db):
9294
}
9395

9496
return kwic_result
97+
98+
99+
def kwic_to_csv(results, filter_html=False):
    """Convert KWIC results to a CSV string.

    Args:
        results: sequence of result dicts, each read for ``"philo_id"`` (an
            iterable whose items are joined with spaces), ``"context"``
            (string) and ``"metadata_fields"`` (dict of column name -> value).
        filter_html: when true, remove every ``<...>`` tag from the context
            and strip surrounding whitespace before writing it.

    Returns:
        The CSV text including a header row, or ``""`` when ``results`` is
        empty. Columns are ``philo_id``, ``context``, then the sorted
        metadata keys.
    """
    if not results:
        return ""
    tags_re = re.compile(r"<[^>]+>")
    # Use the union of metadata keys over all results. Deriving the header
    # from the first result only makes DictWriter raise ValueError as soon as
    # a later result carries a key the first one lacks.
    metadata_keys = sorted({key for result in results for key in result["metadata_fields"]})
    output = io.StringIO()
    # restval fills the cell for results that lack one of the columns.
    writer = csv.DictWriter(output, fieldnames=["philo_id", "context"] + metadata_keys, restval="")
    writer.writeheader()
    for result in results:
        context = result["context"]
        if filter_html:
            context = tags_re.sub("", context).strip()
        row = {"philo_id": " ".join(str(x) for x in result["philo_id"]), "context": context}
        row.update(result["metadata_fields"])
        writer.writerow(row)
    return output.getvalue()

python/philologic/runtime/reports/time_series.py

Lines changed: 26 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
"""Time series"""
33

44
import os
5-
import time
65

76
import numba
87
import numpy as np
@@ -96,7 +95,6 @@ def _bucket_hits_by_year(doc_ids, year_array, start_date, interval, n_ranges):
9695

9796

9897
def generate_time_series(request, config):
99-
t0 = time.time()
10098
db = DB(config.db_path + "/data/")
10199
year_field = validate_column(config.time_series_year_field, db)
102100
time_series_object = {"query": dict([i for i in request]), "query_done": False}
@@ -109,25 +107,24 @@ def generate_time_series(request, config):
109107
time_series_object["results"] = {"absolute_count": {}, "date_count": {}}
110108
return time_series_object
111109

112-
interval = int(request.year_interval)
110+
try:
111+
interval = int(request.year_interval)
112+
except (ValueError, TypeError):
113+
interval = int(config.time_series_interval)
113114

114115
# Get cached doc→year mapping (SQL only on first request per worker)
115-
t1 = time.time()
116116
year_array, year_word_counts, year_doc_counts, min_date, max_date = _get_doc_year_data(db, year_field)
117-
print(f"[time_series] doc year data: {time.time()-t1:.3f}s", flush=True)
118117

119118
# Resolve start/end dates
120119
start_date = int(request.start_date) if request.start_date else min_date
121120
end_date = int(request.end_date) if request.end_date else max_date
122121

123122
# Fire the word query now that we have start/end dates
124-
t1 = time.time()
125123
hits = None
126124
if request.q:
127125
metadata = dict(request.metadata)
128126
metadata[year_field] = "%d-%d" % (start_date, end_date)
129127
hits = db.query(request["q"], request["method"], request["arg"], raw_results=True, **metadata)
130-
print(f"[time_series] db.query dispatch: {time.time()-t1:.3f}s", flush=True)
131128

132129
# Generate date ranges for output
133130
date_ranges = []
@@ -150,27 +147,18 @@ def generate_time_series(request, config):
150147

151148
# Absolute hit counts: wait for search, then vectorized bucketing
152149
if hits is not None:
153-
t1 = time.time()
154150
hits.finish()
155-
t_finish = time.time() - t1
156151
total_hits = len(hits)
157-
print(f"[time_series] hits.finish() wait ({total_hits} hits): {t_finish:.3f}s", flush=True)
158152

159153
if total_hits > 0:
160-
t1 = time.time()
161154
hit_length = hits.length
162155
mm = np.memmap(hits.filename, dtype="u4", mode="r").reshape(-1, hit_length)
163156
doc_ids = np.ascontiguousarray(mm[:, 0])
164157
del mm # release mmap immediately
165-
t_read = time.time() - t1
166158

167-
# Single-pass JIT on contiguous doc_id column
168-
t1 = time.time()
169159
bin_counts, total_hits = _bucket_hits_by_year(
170160
doc_ids, year_array, start_date, interval, n_ranges
171161
)
172-
t_jit = time.time() - t1
173-
print(f"[time_series] mmap+extract doc_ids: {t_read:.3f}s, JIT bucket: {t_jit:.3f}s ({total_hits} hits in {n_ranges} bins)", flush=True)
174162
else:
175163
bin_counts = np.zeros(n_ranges, dtype=np.int64)
176164
else:
@@ -182,7 +170,6 @@ def generate_time_series(request, config):
182170
total_hits += int(bin_counts[i])
183171

184172
# Build absolute_count output matching expected format
185-
t1 = time.time()
186173
absolute_count = {}
187174
for i, (range_start, date_range) in enumerate(date_ranges):
188175
params = {"report": "concordance", "start": "0", "end": "0"}
@@ -193,7 +180,6 @@ def generate_time_series(request, config):
193180
"count": int(bin_counts[i]),
194181
"url": url,
195182
}
196-
print(f"[time_series] build output ({n_ranges} ranges): {time.time()-t1:.3f}s", flush=True)
197183

198184
time_series_object["results_length"] = int(total_hits)
199185
time_series_object["more_results"] = False
@@ -202,10 +188,31 @@ def generate_time_series(request, config):
202188
"date_count": {str(date): count for date, count in date_counts.items()},
203189
}
204190

205-
print(f"[time_series] TOTAL: {time.time()-t0:.3f}s", flush=True)
206191
return time_series_object
207192

208193

194+
def time_series_to_csv(results):
    """Convert time series results to a CSV string.

    Args:
        results: mapping read for ``"absolute_count"`` (period start ->
            dict with ``"label"`` and ``"count"``) and ``"date_count"``
            (period start -> value written to the ``total_words`` column).
            Period-start keys may be ints or numeric strings.

    Returns:
        The CSV text with header ``period,count,total_words`` and one row per
        period in ascending numeric order of period start, or ``""`` when
        there are no absolute counts.
    """
    import csv
    import io

    if not results:
        return ""
    absolute_count = results.get("absolute_count") or {}
    if not absolute_count:
        return ""
    # Normalise date_count keys to str and look periods up by str, so an int
    # period key in absolute_count still finds its string-keyed entry in
    # date_count instead of silently leaving total_words blank.
    words_by_period = {str(period): total for period, total in (results.get("date_count") or {}).items()}
    output = io.StringIO()
    writer = csv.DictWriter(output, fieldnames=["period", "count", "total_words"])
    writer.writeheader()
    for period_start in sorted(absolute_count, key=int):
        entry = absolute_count[period_start]
        writer.writerow(
            {
                "period": entry["label"],
                "count": entry["count"],
                "total_words": words_by_period.get(str(period_start), ""),
            }
        )
    return output.getvalue()
214+
215+
209216
def get_start_end_date(db, config, start_date=None, end_date=None):
210217
"""Get start and end date of dataset"""
211218
year_field = validate_column(config.time_series_year_field, db)

0 commit comments

Comments
 (0)