@@ -207,6 +207,108 @@ class textfile_callback : public textfile_sub_source::scan_callback {
207207 bool did_promotion{false };
208208};
209209
210+ static bool
211+ remove_duplicates ()
212+ {
213+ static constexpr size_t MINIMUM_DUPLICATE_SIZE = 100 ;
214+
215+ std::unordered_map<std::string, std::vector<std::shared_ptr<logfile>>>
216+ id_to_files;
217+ bool retval = false ;
218+
219+ for (const auto & lf : lnav_data.ld_active_files .fc_files ) {
220+ if (lf->get_format_ptr () == nullptr ) {
221+ continue ;
222+ }
223+ id_to_files[lf->get_content_id ()].push_back (lf);
224+ }
225+
226+ for (auto & [name, poss_dupes] : id_to_files) {
227+ if (poss_dupes.size () == 1 ) {
228+ continue ;
229+ }
230+
231+ std::sort (poss_dupes.begin (),
232+ poss_dupes.end (),
233+ [](const auto & left, const auto & right) {
234+ const auto & lst = left->get_stat ();
235+ const auto & rst = right->get_stat ();
236+ return lst.st_size < rst.st_size
237+ || (lst.st_size == rst.st_size
238+ && rst.st_mtime < lst.st_mtime );
239+ });
240+
241+ const auto & dupe_name = poss_dupes.back ()->get_unique_path ();
242+ auto main_lf = poss_dupes.back ();
243+ poss_dupes.pop_back ();
244+
245+ if (main_lf->size () < MINIMUM_DUPLICATE_SIZE) {
246+ continue ;
247+ }
248+
249+ std::vector<std::shared_ptr<logfile>> to_remove;
250+ for (const auto & poss_dupe_lf : poss_dupes) {
251+ if (poss_dupe_lf->size () < MINIMUM_DUPLICATE_SIZE) {
252+ // not worth hiding
253+ continue ;
254+ }
255+
256+ if (main_lf->get_format_ptr ()->get_name ()
257+ != poss_dupe_lf->get_format_ptr ()->get_name ())
258+ {
259+ // not a duplicate
260+ continue ;
261+ }
262+
263+ auto found_mismatch = false ;
264+ for (size_t lpc = 0 ; lpc < MINIMUM_DUPLICATE_SIZE; lpc++) {
265+ auto main_iter = main_lf->begin () + lpc;
266+ auto poss_dupe_iter = poss_dupe_lf->begin () + lpc;
267+
268+ if (main_iter->get_time <std::chrono::microseconds>()
269+ != poss_dupe_iter->get_time <std::chrono::microseconds>()
270+ || main_iter->get_msg_level ()
271+ != poss_dupe_iter->get_msg_level ()
272+ || main_iter->get_offset () != poss_dupe_iter->get_offset ())
273+ {
274+ // not a duplicate
275+ found_mismatch = true ;
276+ break ;
277+ }
278+ }
279+
280+ if (!found_mismatch) {
281+ to_remove.push_back (poss_dupe_lf);
282+ }
283+ }
284+
285+ if (to_remove.empty ()) {
286+ continue ;
287+ }
288+ log_info (" Keeping duplicated file: %s; size=%lld; mtime=%ld; path=%s" ,
289+ main_lf->get_content_id ().c_str (),
290+ main_lf->get_stat ().st_size ,
291+ main_lf->get_stat ().st_mtime ,
292+ main_lf->get_filename_as_string ().c_str ());
293+ std::for_each (to_remove.begin (),
294+ to_remove.end (),
295+ [&dupe_name, &retval](auto & lf) {
296+ if (lf->mark_as_duplicate (dupe_name)) {
297+ log_info (
298+ " Hiding copy: size=%lld; mtime=%ld; "
299+ " path=%s" ,
300+ lf->get_stat ().st_size ,
301+ lf->get_stat ().st_mtime ,
302+ lf->get_filename_as_string ().c_str ());
303+ lnav_data.ld_log_source .find_data (lf) |
304+ [](auto ld) { ld->set_visibility (false ); };
305+ retval = true ;
306+ }
307+ });
308+ }
309+ return retval;
310+ }
311+
210312rebuild_indexes_result_t
211313rebuild_indexes (std::optional<ui_clock::time_point> deadline)
212314{
@@ -359,56 +461,7 @@ rebuild_indexes(std::optional<ui_clock::time_point> deadline)
359461 }
360462
361463 if (retval.rir_completed && !retval.rir_rescan_needed ) {
362- std::unordered_map<std::string,
363- std::vector<std::shared_ptr<logfile>>>
364- id_to_files;
365- auto reload = false ;
366-
367- for (const auto & lf : lnav_data.ld_active_files .fc_files ) {
368- if (lf->get_format_ptr () == nullptr ) {
369- continue ;
370- }
371- id_to_files[lf->get_content_id ()].push_back (lf);
372- }
373-
374- for (auto & [name, lf] : id_to_files) {
375- if (lf.size () == 1 ) {
376- continue ;
377- }
378-
379- std::sort (lf.begin (),
380- lf.end (),
381- [](const auto & left, const auto & right) {
382- const auto & lst = left->get_stat ();
383- const auto & rst = right->get_stat ();
384- return lst.st_size < rst.st_size
385- || (lst.st_size == rst.st_size
386- && rst.st_mtime < lst.st_mtime );
387- });
388-
389- const auto & dupe_name = lf.back ()->get_unique_path ();
390- log_info (
391- " Keeping duplicated file: %s; size=%lld; mtime=%ld; "
392- " path=%s" ,
393- lf.back ()->get_content_id ().c_str (),
394- lf.back ()->get_stat ().st_size ,
395- lf.back ()->get_stat ().st_mtime ,
396- lf.back ()->get_filename_as_string ().c_str ());
397- lf.pop_back ();
398- std::for_each (
399- lf.begin (), lf.end (), [&dupe_name, &reload](auto & lf) {
400- if (lf->mark_as_duplicate (dupe_name)) {
401- log_info (
402- " Hiding copy: size=%lld; mtime=%ld; path=%s" ,
403- lf->get_stat ().st_size ,
404- lf->get_stat ().st_mtime ,
405- lf->get_filename_as_string ().c_str ());
406- lnav_data.ld_log_source .find_data (lf) |
407- [](auto ld) { ld->set_visibility (false ); };
408- reload = true ;
409- }
410- });
411- }
464+ auto reload = remove_duplicates ();
412465
413466 if (reload) {
414467 log_trace (
0 commit comments