22"""Time series"""
33
44import os
5- import time
65
76import numba
87import numpy as np
@@ -96,7 +95,6 @@ def _bucket_hits_by_year(doc_ids, year_array, start_date, interval, n_ranges):
9695
9796
9897def generate_time_series (request , config ):
99- t0 = time .time ()
10098 db = DB (config .db_path + "/data/" )
10199 year_field = validate_column (config .time_series_year_field , db )
102100 time_series_object = {"query" : dict ([i for i in request ]), "query_done" : False }
@@ -109,25 +107,24 @@ def generate_time_series(request, config):
109107 time_series_object ["results" ] = {"absolute_count" : {}, "date_count" : {}}
110108 return time_series_object
111109
112- interval = int (request .year_interval )
110+ try :
111+ interval = int (request .year_interval )
112+ except (ValueError , TypeError ):
113+ interval = int (config .time_series_interval )
113114
114115 # Get cached doc→year mapping (SQL only on first request per worker)
115- t1 = time .time ()
116116 year_array , year_word_counts , year_doc_counts , min_date , max_date = _get_doc_year_data (db , year_field )
117- print (f"[time_series] doc year data: { time .time ()- t1 :.3f} s" , flush = True )
118117
119118 # Resolve start/end dates
120119 start_date = int (request .start_date ) if request .start_date else min_date
121120 end_date = int (request .end_date ) if request .end_date else max_date
122121
123122 # Fire the word query now that we have start/end dates
124- t1 = time .time ()
125123 hits = None
126124 if request .q :
127125 metadata = dict (request .metadata )
128126 metadata [year_field ] = "%d-%d" % (start_date , end_date )
129127 hits = db .query (request ["q" ], request ["method" ], request ["arg" ], raw_results = True , ** metadata )
130- print (f"[time_series] db.query dispatch: { time .time ()- t1 :.3f} s" , flush = True )
131128
132129 # Generate date ranges for output
133130 date_ranges = []
@@ -150,27 +147,18 @@ def generate_time_series(request, config):
150147
151148 # Absolute hit counts: wait for search, then vectorized bucketing
152149 if hits is not None :
153- t1 = time .time ()
154150 hits .finish ()
155- t_finish = time .time () - t1
156151 total_hits = len (hits )
157- print (f"[time_series] hits.finish() wait ({ total_hits } hits): { t_finish :.3f} s" , flush = True )
158152
159153 if total_hits > 0 :
160- t1 = time .time ()
161154 hit_length = hits .length
162155 mm = np .memmap (hits .filename , dtype = "u4" , mode = "r" ).reshape (- 1 , hit_length )
163156 doc_ids = np .ascontiguousarray (mm [:, 0 ])
164157 del mm # release mmap immediately
165- t_read = time .time () - t1
166158
167- # Single-pass JIT on contiguous doc_id column
168- t1 = time .time ()
169159 bin_counts , total_hits = _bucket_hits_by_year (
170160 doc_ids , year_array , start_date , interval , n_ranges
171161 )
172- t_jit = time .time () - t1
173- print (f"[time_series] mmap+extract doc_ids: { t_read :.3f} s, JIT bucket: { t_jit :.3f} s ({ total_hits } hits in { n_ranges } bins)" , flush = True )
174162 else :
175163 bin_counts = np .zeros (n_ranges , dtype = np .int64 )
176164 else :
@@ -182,7 +170,6 @@ def generate_time_series(request, config):
182170 total_hits += int (bin_counts [i ])
183171
184172 # Build absolute_count output matching expected format
185- t1 = time .time ()
186173 absolute_count = {}
187174 for i , (range_start , date_range ) in enumerate (date_ranges ):
188175 params = {"report" : "concordance" , "start" : "0" , "end" : "0" }
@@ -193,7 +180,6 @@ def generate_time_series(request, config):
193180 "count" : int (bin_counts [i ]),
194181 "url" : url ,
195182 }
196- print (f"[time_series] build output ({ n_ranges } ranges): { time .time ()- t1 :.3f} s" , flush = True )
197183
198184 time_series_object ["results_length" ] = int (total_hits )
199185 time_series_object ["more_results" ] = False
@@ -202,10 +188,31 @@ def generate_time_series(request, config):
202188 "date_count" : {str (date ): count for date , count in date_counts .items ()},
203189 }
204190
205- print (f"[time_series] TOTAL: { time .time ()- t0 :.3f} s" , flush = True )
206191 return time_series_object
207192
208193
def time_series_to_csv(results):
    """Convert time series results to a CSV string.

    Args:
        results: Mapping that may contain ``"absolute_count"`` (period start
            -> dict with at least ``"label"`` and ``"count"``) and
            ``"date_count"`` (period start -> value written to the
            ``total_words`` column).

    Returns:
        CSV text with the header ``period,count,total_words`` and one row per
        period, ordered by the integer value of the period start. Returns an
        empty string when ``"absolute_count"`` is missing or empty.

    Raises:
        KeyError: If an ``absolute_count`` entry lacks ``"label"`` or
            ``"count"``.
        ValueError: If a period key cannot be interpreted as an integer.
    """
    import csv
    import io

    absolute_count = results.get("absolute_count", {})
    date_count = results.get("date_count", {})
    if not absolute_count:
        return ""

    output = io.StringIO()
    writer = csv.DictWriter(output, fieldnames=["period", "count", "total_words"])
    writer.writeheader()
    for period_start in sorted(absolute_count, key=int):
        entry = absolute_count[period_start]
        # The two mappings may not share a key type (e.g. int period starts in
        # one, stringified dates in the other), so try the key as-is first and
        # then its str/int forms before giving up and leaving the cell blank.
        total_words = ""
        for candidate in (period_start, str(period_start), int(period_start)):
            if candidate in date_count:
                total_words = date_count[candidate]
                break
        writer.writerow(
            {
                "period": entry["label"],
                "count": entry["count"],
                "total_words": total_words,
            }
        )
    return output.getvalue()
214+
215+
209216def get_start_end_date (db , config , start_date = None , end_date = None ):
210217 """Get start and end date of dataset"""
211218 year_field = validate_column (config .time_series_year_field , db )
0 commit comments