Skip to content

Commit 46a885b

Browse files
committed
[indexing] be less aggressive when hiding duplicate files
1 parent 35e2f95 commit 46a885b

8 files changed

Lines changed: 520 additions & 51 deletions

NEWS.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,13 @@ Bug Fixes:
9696
but they are displayed now.
9797
- Checks for archives with file paths that could
9898
escape containment.
99+
* The duplicate file check is less aggressive now.
100+
Previously, if the first lines of logfiles matched
101+
exactly, they were considered duplicates and the
102+
smaller/newer copies were hidden. Now, the duplication
103+
check is only done on files that contain at least
104+
100 lines and the first 100 lines are checked to see if
105+
they have the same timestamp/level/file-offset.
99106

100107

101108
## lnav v0.14.0

src/lnav.indexing.cc

Lines changed: 103 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,108 @@ class textfile_callback : public textfile_sub_source::scan_callback {
207207
bool did_promotion{false};
208208
};
209209

210+
static bool
211+
remove_duplicates()
212+
{
213+
static constexpr size_t MINIMUM_DUPLICATE_SIZE = 100;
214+
215+
std::unordered_map<std::string, std::vector<std::shared_ptr<logfile>>>
216+
id_to_files;
217+
bool retval = false;
218+
219+
for (const auto& lf : lnav_data.ld_active_files.fc_files) {
220+
if (lf->get_format_ptr() == nullptr) {
221+
continue;
222+
}
223+
id_to_files[lf->get_content_id()].push_back(lf);
224+
}
225+
226+
for (auto& [name, poss_dupes] : id_to_files) {
227+
if (poss_dupes.size() == 1) {
228+
continue;
229+
}
230+
231+
std::sort(poss_dupes.begin(),
232+
poss_dupes.end(),
233+
[](const auto& left, const auto& right) {
234+
const auto& lst = left->get_stat();
235+
const auto& rst = right->get_stat();
236+
return lst.st_size < rst.st_size
237+
|| (lst.st_size == rst.st_size
238+
&& rst.st_mtime < lst.st_mtime);
239+
});
240+
241+
const auto& dupe_name = poss_dupes.back()->get_unique_path();
242+
auto main_lf = poss_dupes.back();
243+
poss_dupes.pop_back();
244+
245+
if (main_lf->size() < MINIMUM_DUPLICATE_SIZE) {
246+
continue;
247+
}
248+
249+
std::vector<std::shared_ptr<logfile>> to_remove;
250+
for (const auto& poss_dupe_lf : poss_dupes) {
251+
if (poss_dupe_lf->size() < MINIMUM_DUPLICATE_SIZE) {
252+
// not worth hiding
253+
continue;
254+
}
255+
256+
if (main_lf->get_format_ptr()->get_name()
257+
!= poss_dupe_lf->get_format_ptr()->get_name())
258+
{
259+
// not a duplicate
260+
continue;
261+
}
262+
263+
auto found_mismatch = false;
264+
for (size_t lpc = 0; lpc < MINIMUM_DUPLICATE_SIZE; lpc++) {
265+
auto main_iter = main_lf->begin() + lpc;
266+
auto poss_dupe_iter = poss_dupe_lf->begin() + lpc;
267+
268+
if (main_iter->get_time<std::chrono::microseconds>()
269+
!= poss_dupe_iter->get_time<std::chrono::microseconds>()
270+
|| main_iter->get_msg_level()
271+
!= poss_dupe_iter->get_msg_level()
272+
|| main_iter->get_offset() != poss_dupe_iter->get_offset())
273+
{
274+
// not a duplicate
275+
found_mismatch = true;
276+
break;
277+
}
278+
}
279+
280+
if (!found_mismatch) {
281+
to_remove.push_back(poss_dupe_lf);
282+
}
283+
}
284+
285+
if (to_remove.empty()) {
286+
continue;
287+
}
288+
log_info("Keeping duplicated file: %s; size=%lld; mtime=%ld; path=%s",
289+
main_lf->get_content_id().c_str(),
290+
main_lf->get_stat().st_size,
291+
main_lf->get_stat().st_mtime,
292+
main_lf->get_filename_as_string().c_str());
293+
std::for_each(to_remove.begin(),
294+
to_remove.end(),
295+
[&dupe_name, &retval](auto& lf) {
296+
if (lf->mark_as_duplicate(dupe_name)) {
297+
log_info(
298+
" Hiding copy: size=%lld; mtime=%ld; "
299+
"path=%s",
300+
lf->get_stat().st_size,
301+
lf->get_stat().st_mtime,
302+
lf->get_filename_as_string().c_str());
303+
lnav_data.ld_log_source.find_data(lf) |
304+
[](auto ld) { ld->set_visibility(false); };
305+
retval = true;
306+
}
307+
});
308+
}
309+
return retval;
310+
}
311+
210312
rebuild_indexes_result_t
211313
rebuild_indexes(std::optional<ui_clock::time_point> deadline)
212314
{
@@ -359,56 +461,7 @@ rebuild_indexes(std::optional<ui_clock::time_point> deadline)
359461
}
360462

361463
if (retval.rir_completed && !retval.rir_rescan_needed) {
362-
std::unordered_map<std::string,
363-
std::vector<std::shared_ptr<logfile>>>
364-
id_to_files;
365-
auto reload = false;
366-
367-
for (const auto& lf : lnav_data.ld_active_files.fc_files) {
368-
if (lf->get_format_ptr() == nullptr) {
369-
continue;
370-
}
371-
id_to_files[lf->get_content_id()].push_back(lf);
372-
}
373-
374-
for (auto& [name, lf] : id_to_files) {
375-
if (lf.size() == 1) {
376-
continue;
377-
}
378-
379-
std::sort(lf.begin(),
380-
lf.end(),
381-
[](const auto& left, const auto& right) {
382-
const auto& lst = left->get_stat();
383-
const auto& rst = right->get_stat();
384-
return lst.st_size < rst.st_size
385-
|| (lst.st_size == rst.st_size
386-
&& rst.st_mtime < lst.st_mtime);
387-
});
388-
389-
const auto& dupe_name = lf.back()->get_unique_path();
390-
log_info(
391-
"Keeping duplicated file: %s; size=%lld; mtime=%ld; "
392-
"path=%s",
393-
lf.back()->get_content_id().c_str(),
394-
lf.back()->get_stat().st_size,
395-
lf.back()->get_stat().st_mtime,
396-
lf.back()->get_filename_as_string().c_str());
397-
lf.pop_back();
398-
std::for_each(
399-
lf.begin(), lf.end(), [&dupe_name, &reload](auto& lf) {
400-
if (lf->mark_as_duplicate(dupe_name)) {
401-
log_info(
402-
" Hiding copy: size=%lld; mtime=%ld; path=%s",
403-
lf->get_stat().st_size,
404-
lf->get_stat().st_mtime,
405-
lf->get_filename_as_string().c_str());
406-
lnav_data.ld_log_source.find_data(lf) |
407-
[](auto ld) { ld->set_visibility(false); };
408-
reload = true;
409-
}
410-
});
411-
}
464+
auto reload = remove_duplicates();
412465

413466
if (reload) {
414467
log_trace(

test/Makefile.am

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -412,6 +412,7 @@ dist_noinst_DATA = \
412412
logfile_rust_tracing.0 \
413413
logfile_shop_access_log.0 \
414414
logfile_spark.0 \
415+
logfile_spark_dupe.0 \
415416
logfile_strace_log.0 \
416417
logfile_strace_log.1 \
417418
logfile_strace_log.2 \

test/expected/test_sql_views_vtab.sh_4af3a57478faa9f206e90bb0500abbad566d534c.err

Whitespace-only changes.
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
view_name,basename(filepath),visible
2+
log,logfile_spark.0,1
3+
log,logfile_spark_dupe.0,0
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
view_name,basename(filepath),visible
22
log,logfile_access_log.0,1
3-
log,logfile_access_log_dupe.0,0
3+
log,logfile_access_log_dupe.0,1

0 commit comments

Comments
 (0)