Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions services/apps/script_executor_worker/src/activities.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,6 @@ import {
blockMemberOrganizationAffiliation,
getOrganizationMembers,
} from './activities/block-organization-affiliation'
import {
findDuplicateMembersAfterDate,
moveMemberActivityRelations,
} from './activities/cleanup/duplicate-members'
import { deleteMember, getMembersToCleanup, syncRemoveMember } from './activities/cleanup/member'
import {
deleteOrganization,
Expand Down Expand Up @@ -44,6 +40,10 @@ import {
findMembersWithSameVerifiedEmailsInDifferentPlatforms,
} from './activities/merge-members-with-similar-identities'
import { getUnprocessedLLMApprovedSuggestions } from './activities/process-llm-verified-merges'
import {
getOrganizationsToPrune,
pruneOrganization,
} from './activities/prune-duplicate-organizations'
import { deleteIndexedEntities } from './activities/sync/entity-index'
import { getMembersForSync, syncMembersBatch } from './activities/sync/member'
import { getOrganizationsForSync, syncOrganizationsBatch } from './activities/sync/organization'
Expand Down Expand Up @@ -78,12 +78,12 @@ export {
deleteIndexedEntities,
getUnprocessedLLMApprovedSuggestions,
getWorkflowsCount,
findDuplicateMembersAfterDate,
moveMemberActivityRelations,
getBotMembersWithOrgAffiliation,
removeBotMemberOrganization,
unlinkOrganizationFromBotActivities,
blockMemberOrganizationAffiliation,
getOrganizationMembers,
calculateMemberAffiliations,
getOrganizationsToPrune,
pruneOrganization,
}

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import OrganizationRepository from '@crowd/data-access-layer/src/old/apps/script_executor_worker/organization.repo'

import { svc } from '../main'

export async function pruneOrganization(orgId: string): Promise<void> {
try {
const orgRepo = new OrganizationRepository(svc.postgres.writer.connection(), svc.log)
await orgRepo.pruneOrganization(orgId)
} catch (error) {
svc.log.error(error, 'Error pruning organization in database!')
throw error
}
}

export async function getOrganizationsToPrune(
batchSize: number,
): Promise<{ id: string; displayName: string }[]> {
try {
const orgRepo = new OrganizationRepository(svc.postgres.reader.connection(), svc.log)
return orgRepo.getOrganizationsToPrune(batchSize)
} catch (error) {
svc.log.error(error, 'Error getting organizations to prune!')
throw error
}
}
6 changes: 0 additions & 6 deletions services/apps/script_executor_worker/src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,6 @@ export interface IProcessLLMVerifiedMergesArgs extends IScriptBatchTestArgs {
type: string
}

export interface ICleanupDuplicateMembersArgs extends IScriptBatchTestArgs {
cutoffDate?: string
checkByActivityIdentity?: boolean
checkByTwitterIdentity?: boolean
}

export interface IDedupActivityRelationsArgs extends IScriptBatchTestArgs {
groupsPerRun?: number
cursor?: Omit<IActivityRelationDuplicateGroup, 'activityIds'>
Expand Down
4 changes: 2 additions & 2 deletions services/apps/script_executor_worker/src/workflows.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import { blockOrganizationAffiliation } from './workflows/block-organization-affiliation'
import { cleanupDuplicateMembers } from './workflows/cleanup/duplicate-members'
import { cleanupMembers } from './workflows/cleanup/members'
import { cleanupOrganizations } from './workflows/cleanup/organizations'
import { dissectMember } from './workflows/dissectMember'
Expand All @@ -8,6 +7,7 @@ import { findAndMergeMembersWithSameVerifiedEmailsInDifferentPlatforms } from '.
import { fixBotMembersAffiliation } from './workflows/fix-bot-members-affiliation'
import { fixOrgIdentitiesWithWrongUrls } from './workflows/fixOrgIdentitiesWithWrongUrls'
import { processLLMVerifiedMerges } from './workflows/processLLMVerifiedMerges'
import { pruneDuplicateOrganizations } from './workflows/prune-duplicate-organizations'
import { syncMembers } from './workflows/sync/members'
import { syncOrganizations } from './workflows/sync/organizations'

Expand All @@ -21,7 +21,7 @@ export {
cleanupMembers,
cleanupOrganizations,
processLLMVerifiedMerges,
cleanupDuplicateMembers,
pruneDuplicateOrganizations,
fixBotMembersAffiliation,
blockOrganizationAffiliation,
}

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import { proxyActivities } from '@temporalio/workflow'

import * as activities from '../activities'
import { IScriptBatchTestArgs } from '../types'
import { chunkArray } from '../utils/common'

const { getOrganizationsToPrune, pruneOrganization, syncRemoveOrganization } = proxyActivities<
typeof activities
>({
startToCloseTimeout: '30 minutes',
retry: { maximumAttempts: 3, backoffCoefficient: 3 },
})

export async function pruneDuplicateOrganizations(args: IScriptBatchTestArgs): Promise<void> {
const BATCH_SIZE = args.batchSize ?? 100

const organizationsToPrune = await getOrganizationsToPrune(BATCH_SIZE)

if (organizationsToPrune.length === 0) {
console.log('No more organizations to prune!')
return
}

const CHUNK_SIZE = 25

for (const chunk of chunkArray(organizationsToPrune, CHUNK_SIZE)) {
const cleanupTasks = chunk.map(async (o) => {
console.log('Pruning organization', o.displayName)
await syncRemoveOrganization(o.id)
return pruneOrganization(o.id)
})

await Promise.all(cleanupTasks).catch((err) => {
console.error('Error pruning organizations!', err)
})
}
}
Comment thread
skwowet marked this conversation as resolved.
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,87 @@ class OrganizationRepository {
{ organizationId, limit, offset },
)
}

public async pruneOrganization(organizationId: string): Promise<void> {
const tablesToDelete = [
{ name: 'organizationNoMerge', conditions: ['organizationId', 'noMergeId'] },
{ name: 'organizationToMerge', conditions: ['organizationId', 'toMergeId'] },
{ name: 'organizationToMergeRaw', conditions: ['organizationId', 'toMergeId'] },
{ name: 'organizationEnrichmentCache', conditions: ['organizationId'] },
{ name: 'organizationEnrichments', conditions: ['organizationId'] },
{ name: 'memberSegmentAffiliations', conditions: ['organizationId'] },
{ name: 'organizationSegmentsAgg', conditions: ['organizationId'] },
{ name: 'organizationSegments', conditions: ['organizationId'] },
{ name: 'orgAttributes', conditions: ['organizationId'] },
{ name: 'organizationIdentities', conditions: ['organizationId'] },
{ name: 'memberOrganizations', conditions: ['organizationId'] },
{ name: 'organizations', conditions: ['id'] },
]

await this.connection.tx(async (tx) => {
for (const table of tablesToDelete) {
const whereClause = table.conditions
.map((field) => `"${field}" = $(organizationId)`)
.join(' OR ')
await tx.none(`DELETE FROM "${table.name}" WHERE ${whereClause}`, { organizationId })
}
})
}

public async getOrganizationsToPrune(
batchSize: number,
): Promise<{ id: string; displayName: string }[]> {
return this.connection.query(
`
WITH email_providers AS (
SELECT unnest(ARRAY[
'gmail.com', 'gmail.co.uk', 'gmail.com.au', 'gmail.com.tr',
'yahoo.com', 'yahoo.co.uk', 'yahoo.com.br', 'yahoo.co.in', 'yahoo.fr', 'yahoo.es', 'yahoo.it', 'yahoo.de', 'yahoo.ca', 'yahoo.com.au', 'yahoo.in', 'yahoo.co.jp', 'yahoo.com.ar', 'yahoo.com.mx', 'yahoo.co.id', 'yahoo.com.sg', 'yahoo.co.za', 'yahoo.com.ph', 'yahoo.com.tw', 'yahoo.com.hk', 'yahoo.com.vn',
'hotmail.com', 'hotmail.co.uk', 'hotmail.fr', 'hotmail.ca', 'hotmail.it', 'hotmail.es', 'hotmail.de', 'hotmail.com.au', 'hotmail.com.mx',
'icloud.com', 'icloud.com.cn',
'fastmail.com', 'tutanota.com', 'tuta.io',
'gmx.com', 'gmx.de', 'gmx.net', 'gmx.at', 'gmx.ch', 'gmx.fr', 'gmx.co.uk',
'aol.com', 'aol.co.uk', 'aol.fr', 'aol.de',
'msn.com', 'wanadoo.fr', 'orange.fr', 'comcast.net',
'live.com', 'live.co.uk', 'live.fr', 'live.nl', 'live.it', 'live.com.au', 'live.ca', 'live.cn',
'rediffmail.com', 'sify.com', 'indiatimes.com', 'free.fr', 'web.de',
'yandex.ru', 'yandex.com', 'yandex.com.tr', 'ya.ru',
'ymail.com', 'libero.it',
'outlook.com', 'outlook.fr', 'outlook.co.uk', 'outlook.de', 'outlook.es', 'outlook.it', 'outlook.com.au', 'outlook.com.br', 'outlook.com.mx', 'outlook.co.jp', 'outlook.in', 'outlook.com.sg', 'outlook.co.za', 'outlook.co.in',
'uol.com.br', 'bol.com.br',
'mail.ru', 'inbox.ru', 'list.ru', 'bk.ru',
'mail.com', 'mail.de', 'mail.co.uk',
'cox.net', 'sbcglobal.net', 'sfr.fr', 'verizon.net', 'googlemail.com', 'ig.com.br', 'bigpond.com', 'bigpond.net.au', 'terra.com.br', 'neuf.fr', 'alice.it', 'rocketmail.com', 'att.net', 'laposte.net', 'bellsouth.net', 'charter.net', 'rambler.ru', 'tiscali.it', 'tiscali.co.uk', 'shaw.ca', 'sky.com', 'earthlink.net', 'optonline.net', 'freenet.de', 't-online.de', 'aliceadsl.fr', 'virgilio.it', 'home.nl', 'qq.com', 'vip.qq.com', 'telenet.be', 'pandora.be', 'me.com', 'voila.fr', 'planet.nl', 'tin.it', 'ntlworld.com', 'arcor.de', 'frontiernet.net', 'hetnet.nl', 'zonnet.nl', 'club-internet.fr', 'juno.com', 'optusnet.com.au', 'blueyonder.co.uk', 'bluewin.ch', 'skynet.be', 'sympatico.ca', 'windstream.net', 'mac.com', 'centurytel.net', 'chello.nl', 'aim.com',
'protonmail.com', 'protonmail.ch', 'proton.me', 'pm.me', 'duck.com',
'zoho.com', 'zohomail.com',
'users.noreply.github.com',
'126.com', '139.com', '163.com', '188.com', 'foxmail.com', 'tom.com', '21cn.com', 'yeah.net',
'naver.com', 'daum.net', 'hanmail.net',
'hey.com', 'inbox.com', 'lycos.com', 'excite.com', 'hushmail.com', 'mailfence.com', 'mailbox.org', 'posteo.de', 'startmail.com', 'runbox.com', 'countermail.com', 'mynet.com',
'wp.pl', 'onet.pl', 'interia.pl', 'o2.pl',
'seznam.cz', 'centrum.cz',
'mailinator.com', 'guerrillamail.com', '10minutemail.com', 'tempmail.com'
]) AS provider
)
SELECT DISTINCT o.id, o."displayName"
FROM organizations o
INNER JOIN "organizationIdentities" oi
ON o.id = oi."organizationId"
INNER JOIN email_providers ep
ON LOWER(oi.value) = ep.provider
WHERE o."deletedAt" IS NULL
AND oi.type = 'primary-domain'
AND NOT EXISTS (
SELECT 1
FROM "memberOrganizations" mo
WHERE mo."organizationId" = o.id
AND (mo.title IS NOT NULL AND mo.title != '')
AND (mo.source IS NOT NULL AND mo.source NOT IN ('email-domain'))
)
Comment thread
skwowet marked this conversation as resolved.
`,
{ batchSize },
)
}
Comment thread
skwowet marked this conversation as resolved.
}

export default OrganizationRepository