Commit f6a424f

feat: skip activities from parent repo in case of fork
1 parent: 7685823

2 files changed: 142 additions & 5 deletions

services/apps/git_integration/src/crowdgit/database/crud.py

Lines changed: 51 additions & 1 deletion
@@ -45,7 +45,7 @@ async def get_recently_processed_repository_by_url(url: str) -> Repository | None
     Used to check if a repository needs reprocessing based on the update interval.
     """
     sql_query = """
-        SELECT id, url, state, priority, "lastProcessedAt", "lockedAt", "createdAt", "updatedAt", "maintainerFile", "forkedFrom"
+        SELECT id, url, state, priority, "lastProcessedAt", "lockedAt", "createdAt", "updatedAt", "maintainerFile", "forkedFrom", "segmentId"
         FROM git.repositories
         WHERE url = $1
         AND "deletedAt" IS NULL

@@ -305,6 +305,56 @@ async def set_maintainer_end_date(
     )


+async def batch_check_parent_activities(
+    activity_keys: list[tuple[str, str, str]],
+    parent_channel: str,
+    parent_segment_id: str,
+) -> set[str]:
+    """
+    Batch check which activities exist in parent repo using full dedup key.
+
+    Args:
+        activity_keys: List of (timestamp, type, sourceId) tuples
+        parent_channel: Parent repository URL
+        parent_segment_id: Parent repository segment ID
+
+    Returns:
+        Set of sourceIds that exist in parent repo
+    """
+    if not activity_keys:
+        return set()
+
+    # Use dedup index with ALL fields for optimal performance
+    # Index: (timestamp, platform, type, sourceId, channel, segmentId)
+    # Build OR conditions for each (timestamp, type, sourceId) combination
+    conditions = []
+    params = ["git", parent_channel, parent_segment_id]
+    param_idx = 4
+
+    for timestamp_str, activity_type, source_id in activity_keys:
+        conditions.append(
+            f'("timestamp" = ${param_idx} AND "type" = ${param_idx + 1} AND "sourceId" = ${param_idx + 2})'
+        )
+        timestamp = datetime.fromisoformat(timestamp_str)
+        params.append(timestamp)
+        params.append(activity_type)
+        params.append(source_id)
+        param_idx += 3
+
+    sql_query = f"""
+        SELECT DISTINCT "sourceId"
+        FROM "activityRelations"
+        WHERE "platform" = $1
+          AND "channel" = $2
+          AND "segmentId" = $3
+          AND ({" OR ".join(conditions)})
+    """
+
+    result = await query(sql_query, tuple(params))
+
+    return {row["sourceId"] for row in result}
+
+
 async def save_service_execution(service_execution: ServiceExecution) -> None:
     """
     Save service execution record to database.
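
For a concrete picture of what this helper sends to Postgres, here is a minimal standalone sketch (not part of the commit; the repository URL, segment ID, and activity keys are invented) that reproduces the placeholder-numbering scheme: $1-$3 are reserved for platform, channel, and segmentId, and each activity key claims the next three placeholders.

# Minimal sketch of the query shape batch_check_parent_activities builds
# (all sample values below are hypothetical, not from the commit).
from datetime import datetime

activity_keys = [
    ("2024-01-15T10:30:00", "authored-commit", "abc123"),
    ("2024-01-16T09:00:00", "authored-commit", "def456"),
]

conditions = []
params = ["git", "https://github.com/org/parent-repo", "segment-uuid"]  # $1..$3
param_idx = 4
for ts, activity_type, source_id in activity_keys:
    conditions.append(
        f'("timestamp" = ${param_idx} AND "type" = ${param_idx + 1} AND "sourceId" = ${param_idx + 2})'
    )
    params += [datetime.fromisoformat(ts), activity_type, source_id]
    param_idx += 3

print(" OR ".join(conditions))
# ("timestamp" = $4 AND "type" = $5 AND "sourceId" = $6) OR ("timestamp" = $7 AND "type" = $8 AND "sourceId" = $9)

Each disjunct matches the leading columns of the dedup index called out in the commit's comment, which is presumably why the full (timestamp, type, sourceId) key is used rather than sourceId alone.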

services/apps/git_integration/src/crowdgit/services/commit/commit_service.py

Lines changed: 91 additions & 4 deletions
@@ -13,7 +13,11 @@
 from pydantic import validate_email
 from tenacity import retry, stop_after_attempt, wait_fixed

-from crowdgit.database.crud import batch_insert_activities, save_service_execution
+from crowdgit.database.crud import (
+    batch_check_parent_activities,
+    batch_insert_activities,
+    save_service_execution,
+)
 from crowdgit.enums import (
     DataSinkWorkerQueueMessageType,
     ErrorCode,
@@ -115,6 +119,7 @@ async def process_single_batch_commits(
             "total_commits": 0,
             "processed_commits": 0,
             "bad_commits": 0,
+            "skipped_activities": 0,
             "total_activities": 0,
         }

@@ -139,6 +144,7 @@ async def process_single_batch_commits(
             batch_info.remote,
             repository.segment_id,
             repository.integration_id,
+            repository.parent_repo,
         )

         batch_end_time = time.time()

@@ -164,6 +170,7 @@ async def process_single_batch_commits(
                 "total_commits": self._metrics_context["total_commits"],
                 "processed_commits": self._metrics_context["processed_commits"],
                 "bad_commits": self._metrics_context["bad_commits"],
+                "skipped_activities": self._metrics_context["skipped_activities"],
                 "total_activities": self._metrics_context["total_activities"],
             },
         )

@@ -200,6 +207,7 @@ async def process_single_batch_commits(
                 "total_commits": self._metrics_context["total_commits"],
                 "processed_commits": self._metrics_context["processed_commits"],
                 "bad_commits": self._metrics_context["bad_commits"],
+                "skipped_activities": self._metrics_context["skipped_activities"],
                 "total_activities": self._metrics_context["total_activities"],
             },
         )
@@ -609,6 +617,66 @@ def create_activities_from_commit(

         return activities_db, activities_queue

+    async def _filter_parent_repo_activities(
+        self,
+        activities_db: list[tuple],
+        activities_queue: list[dict],
+        parent_repo: Repository,
+    ) -> tuple[list[tuple], list[dict], int]:
+        """
+        Filter out activities that exist in parent repo (for fork detection).
+        Uses full dedup key (timestamp, platform, type, sourceId, channel, segmentId) for optimal index usage.
+
+        Args:
+            activities_db: List of activity tuples for database
+            activities_queue: List of activity dicts for Kafka queue
+            parent_repo: Parent repository information
+
+        Returns:
+            Tuple of (filtered_activities_db, filtered_activities_queue, skipped_activities_count)
+        """
+        if not activities_db:
+            return activities_db, activities_queue, 0
+
+        # Extract (timestamp, type, sourceId) for each activity to use full dedup index
+        activity_keys = []
+        for act in activities_db:
+            data = orjson.loads(act[2])["data"]
+            activity_keys.append((data["timestamp"], data["type"], data["sourceId"]))
+
+        # Batch check which activities exist in parent repo
+        parent_source_ids = await batch_check_parent_activities(
+            activity_keys,
+            parent_repo.url,
+            parent_repo.segment_id,
+        )
+
+        if not parent_source_ids:
+            return activities_db, activities_queue, 0
+
+        filtered_activities_db = []
+        filtered_activities_queue = []
+        skipped_activities_count = 0
+
+        for i, activity_tuple in enumerate(activities_db):
+            activity_data = orjson.loads(activity_tuple[2])
+            source_id = activity_data["data"]["sourceId"]
+
+            if source_id not in parent_source_ids:
+                # Activity doesn't exist in parent repo, keep it
+                filtered_activities_db.append(activity_tuple)
+                filtered_activities_queue.append(activities_queue[i])
+            else:
+                # Activity exists in parent repo, skip it
+                skipped_activities_count += 1
+
+        if skipped_activities_count > 0:
+            self.logger.info(
+                f"Filtered out {skipped_activities_count} activities from parent repo {parent_repo.url}"
+            )
+
+        return filtered_activities_db, filtered_activities_queue, skipped_activities_count
+
     async def process_commits_chunk(
         self,
         commit_texts_chunk: list[str | None],
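
The partition step can be exercised on its own. Below is a rough sketch with hypothetical rows; the only detail carried over from the diff is that index 2 of each activity tuple holds the orjson payload whose data.sourceId drives the decision.

# Standalone sketch of the keep/skip partition (sample data is hypothetical).
import orjson

parent_source_ids = {"abc123"}  # pretend this came back from the parent-repo check

activities_db = [
    ("m1", "act", orjson.dumps({"data": {"sourceId": "abc123"}})),  # duplicate of parent
    ("m2", "act", orjson.dumps({"data": {"sourceId": "def456"}})),  # unique to the fork
]
activities_queue = [{"sourceId": "abc123"}, {"sourceId": "def456"}]

kept_db, kept_queue, skipped = [], [], 0
for i, act in enumerate(activities_db):
    source_id = orjson.loads(act[2])["data"]["sourceId"]
    if source_id in parent_source_ids:
        skipped += 1  # already attributed to the parent repository
    else:
        kept_db.append(act)
        kept_queue.append(activities_queue[i])

assert (len(kept_db), len(kept_queue), skipped) == (1, 1, 1)

Filtering both lists through the same loop index keeps activities_db and activities_queue aligned, which the diff relies on when it pairs activities_queue[i] with each kept tuple.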
@@ -617,6 +685,7 @@ async def process_commits_chunk(
         remote: str,
         segment_id: str,
         integration_id: str,
+        parent_repo: Repository | None,
     ) -> None:
         """
         Process a chunk of raw commit texts into activities and write them to DB and Kafka.
@@ -674,15 +743,31 @@
         del commit_lines
         del numstats_text

-        self.logger.info(
-            f"Processed {processed_commits} commits, skipped {bad_commits} invalid commits in {repo_path}"
-        )
+        # Filter out activities from parent repo (for forks)
+        skipped_activities = 0
+        if parent_repo:
+            (
+                activities_db,
+                activities_queue,
+                skipped_activities,
+            ) = await self._filter_parent_repo_activities(
+                activities_db, activities_queue, parent_repo
+            )

+        if skipped_activities > 0:
+            self.logger.info(
+                f"Processed {processed_commits} commits, skipped {bad_commits} invalid commits, filtered {skipped_activities} activities from parent repo in {repo_path}"
+            )
+        else:
+            self.logger.info(
+                f"Processed {processed_commits} commits, skipped {bad_commits} invalid commits in {repo_path}"
+            )
         # Update metrics context
         if self._metrics_context:
             self._metrics_context["processed_commits"] += processed_commits
             self._metrics_context["bad_commits"] += bad_commits
             self._metrics_context["total_activities"] += len(activities_db)
+            self._metrics_context["skipped_activities"] += skipped_activities

         # Write activities to database and queue
         if activities_db:
@@ -701,6 +786,7 @@ async def _process_activities_from_commits(
         remote: str,
         segment_id: str,
         integration_id: str,
+        parent_repo: Repository | None = None,
     ):
         """
         Parse raw git log output, process commits into activities, and save to database.
@@ -747,6 +833,7 @@ async def process_single_chunk(chunk_start_idx: int, chunk_end_idx: int):
                 remote,
                 segment_id,
                 integration_id,
+                parent_repo,
             )
             completed_chunks += 1
             self.logger.info(f"Progress: {completed_chunks}/{total_chunks} chunks")
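
End to end, the commit threads an optional parent_repo from the batch processor down to each chunk, so a fork only records activity that is genuinely its own. A self-contained sketch of that flow, with the database lookup stubbed out (every name and value here is a hypothetical stand-in, not from the commit):

# Self-contained sketch of the fork dedup flow (all names/data hypothetical).
import asyncio

async def batch_check_parent(keys, parent_channel, parent_segment_id):
    # Stand-in for the real lookup against "activityRelations".
    already_in_parent = {"abc123"}
    return already_in_parent & {source_id for _, _, source_id in keys}

async def main():
    fork_activity_keys = [
        ("2024-01-15T10:30:00", "authored-commit", "abc123"),  # inherited from parent
        ("2024-01-16T09:00:00", "authored-commit", "def456"),  # new in the fork
    ]
    dupes = await batch_check_parent(
        fork_activity_keys, "https://github.com/org/parent-repo", "segment-uuid"
    )
    kept = [k for k in fork_activity_keys if k[2] not in dupes]
    print(f"kept {len(kept)} activities, skipped {len(dupes)}")  # kept 1, skipped 1

asyncio.run(main())

Note that _process_activities_from_commits defaults parent_repo to None, so existing non-fork callers are unaffected; only repositories with a resolved parent pay the extra lookup.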
