1+ import logging
2+ import requests
3+ import threading
4+ from time import sleep
5+ from random import choice
6+ from bs4 import BeautifulSoup
7+ from unidecode import unidecode
8+ from urllib .parse import urlparse
9+ from crosslinked .logger import Log
10+ from datetime import datetime , timedelta
11+ from urllib3 import disable_warnings , exceptions
12+
# Module-level setup, executed once at import time.

# Logger that CrossLinked.log_results() writes one CSV-formatted record to per result.
csv = logging.getLogger('cLinked_csv')

# Only let urllib3 log records at WARNING level or above through.
logging.getLogger("urllib3").setLevel(logging.WARNING)

# web_request() sends with verify=False, so silence urllib3's InsecureRequestWarning.
disable_warnings(category=exceptions.InsecureRequestWarning)
16+
17+
class Timer(threading.Thread):
    """Background thread that clears its ``running`` flag after ``timeout`` seconds.

    Attributes:
        start_time: ``datetime`` captured when the timer is started; None before that.
        running: None before the timer is started, True while it is counting,
            False once it has expired or ``stop()`` has been called.
        timeout: Number of seconds to count before the timer stops itself.
    """

    def __init__(self, timeout):
        threading.Thread.__init__(self)
        self.start_time = None
        self.running = None
        self.timeout = timeout

    def start(self):
        """Mark the timer as running, then launch the thread.

        The state is set here, in the caller's thread, so that ``running`` is
        already True when ``start()`` returns. Setting it only inside ``run()``
        lets a caller that checks ``running`` right after ``start()`` still
        observe None.

        Raises:
            RuntimeError: If the thread has already been started once.
        """
        previous = (self.running, self.start_time)
        self.running = True
        self.start_time = datetime.now()
        try:
            threading.Thread.start(self)
        except RuntimeError:
            # Thread was already started: leave the existing state untouched.
            self.running, self.start_time = previous
            raise

    def run(self):
        """Poll the clock every 50 ms until the timeout elapses or ``stop()`` is called."""
        if self.start_time is None:
            # run() was invoked directly instead of through start().
            self.running = True
            self.start_time = datetime.now()
        logging.debug("Thread Timer: Started")

        while self.running:
            if (datetime.now() - self.start_time) > timedelta(seconds=self.timeout):
                self.stop()
            sleep(0.05)

    def stop(self):
        """Clear the ``running`` flag so the polling loop in ``run()`` exits."""
        logging.debug("Thread Timer: Stopped")
        self.running = False
38+
39+
class CrossLinked:
    """Collect LinkedIn profile names by paging through search-engine results.

    The search URL restricts results to ``site:linkedin.com/in`` combined with
    the quoted target string. Each accepted result is a dict with the keys
    ``url``, ``text``, ``title`` and ``name`` (see ``link_parser``).
    """

    def __init__(self, search_engine, target, timeout, conn_timeout=3, proxies=None, jitter=0):
        """Store the search configuration.

        Args:
            search_engine: Key into ``self.url`` ('google' or 'bing').
            target: String inserted, quoted, into the search query.
            timeout: Overall search duration in seconds (passed to ``Timer``).
            conn_timeout: Per-request timeout passed to ``web_request``.
            proxies: Optional list of proxies passed to ``web_request``;
                defaults to a fresh empty list per instance.
            jitter: Seconds to sleep between successive result pages.
        """
        self.results = []
        self.url = {'google': 'https://www.google.com/search?q=site:linkedin.com/in+"{}"&num=100&start={}',
                    'bing': 'http://www.bing.com/search?q="{}"+site:linkedin.com/in&first={}'}

        self.runtime = datetime.now().strftime('%m-%d-%Y %H:%M:%S')
        self.search_engine = search_engine
        self.conn_timeout = conn_timeout
        self.timeout = timeout
        # A literal [] default would be one list shared by every instance.
        self.proxies = [] if proxies is None else proxies
        self.target = target
        self.jitter = jitter

    def search(self):
        """Request result pages until the timer expires, a non-200 reply arrives, or Ctrl-C.

        The number of results collected so far is used as the page offset.

        Returns:
            The list of result dicts accumulated in ``self.results``.
        """
        search_timer = Timer(self.timeout)
        search_timer.start()

        try:
            # The timer thread stays alive exactly as long as its polling loop
            # runs. Checking is_alive() instead of the ``running`` flag avoids
            # skipping the search when the flag has not been set yet right
            # after start().
            while search_timer.is_alive():
                try:
                    url = self.url[self.search_engine].format(self.target, len(self.results))
                    resp = web_request(url, self.conn_timeout, self.proxies)
                    http_code = get_statuscode(resp)

                    if http_code != 200:
                        Log.info("{:<3} {} ({})".format(len(self.results), url, http_code))
                        Log.warn('None 200 response, exiting search ({})'.format(http_code))
                        break

                    self.page_parser(resp)
                    Log.info("{:<3} {} ({})".format(len(self.results), url, http_code))

                    sleep(self.jitter)
                except KeyboardInterrupt:
                    Log.warn("Key event detected, exiting search...")
                    break
        finally:
            # Always stop the timer, even when an unexpected error propagates,
            # so its thread does not keep running until the timeout.
            search_timer.stop()
        return self.results

    def page_parser(self, resp):
        """Run every link extracted from ``resp`` through ``results_handler``.

        A failure on one link is logged as a warning and does not stop the
        remaining links from being processed.
        """
        for link in extract_links(resp):
            try:
                self.results_handler(link)
            except Exception as e:
                Log.warn('Failed Parsing: {}- {}'.format(link.get('href'), e))

    def link_parser(self, url, link):
        """Build a result dict (``url``, ``text``, ``title``, ``name``) from a link."""
        u = {'url': url}
        u['text'] = unidecode(link.text.split("|")[0].split("...")[0])  # Capture link text before trailing chars
        u['title'] = self.parse_linkedin_title(u['text'])  # Extract job title
        u['name'] = self.parse_linkedin_name(u['text'])  # Extract whole name
        return u

    def parse_linkedin_title(self, data):
        """Return the text between the first and second '-' in ``data``, or 'N/A'.

        Anything from 'https:', '...' or '|' onwards is cut off and the result
        is stripped of surrounding whitespace.
        """
        try:
            title = data.split("-")[1].split('https:')[0]
            return title.split("...")[0].split("|")[0].strip()
        except (AttributeError, IndexError):
            # No '-' separator, or data is not a string. Narrower than a bare
            # except so that Ctrl-C still reaches search().
            return 'N/A'

    def parse_linkedin_name(self, data):
        """Return the transliterated text before the first '-' in ``data``, or False on failure."""
        try:
            name = data.split("-")[0].strip()
            return unidecode(name)
        except Exception:
            # Best effort, but without swallowing KeyboardInterrupt/SystemExit.
            return False

    def results_handler(self, link):
        """Record ``link`` when it points at a linkedin.com '/in' profile URL.

        Returns:
            False when the link is rejected by the URL checks, otherwise None.
        """
        url = str(link.get('href')).lower()

        if not extract_subdomain(url).endswith('linkedin.com'):
            return False
        if 'linkedin.com/in' not in url:
            return False

        data = self.link_parser(url, link)
        if data['name']:
            self.log_results(data)

    def log_results(self, d):
        """Append ``d`` to the results and emit it to the debug and CSV loggers."""
        # Prevent Duplicates & non-standard responses (i.e: "<span>linkedin.com</span></a>")
        if d in self.results:
            return
        if 'linkedin.com' in d['name']:
            return

        self.results.append(d)
        # Search results are logged to names.csv but names.txt is not generated until end to prevent duplicates
        logging.debug('name: {:25} RawTxt: {}'.format(d['name'], d['text']))
        fields = (self.runtime, self.search_engine, d['name'], d['title'], d['url'], d['text'])
        # Double any embedded quote so a value containing '"' cannot break out
        # of its quoted CSV field. The trailing comma of the record is kept.
        csv.info(''.join('"{}",'.format(str(value).replace('"', '""')) for value in fields))
131+
132+
def get_statuscode(resp):
    """Return ``resp.status_code``, or 0 when ``resp`` has no such attribute.

    web_request() returns False on failure, which has no ``status_code``, so
    callers can treat 0 as "no response". Using ``getattr`` with a default
    replaces the previous bare ``except:``, which also swallowed
    KeyboardInterrupt and SystemExit.
    """
    return getattr(resp, 'status_code', 0)
138+
139+
def get_proxy(proxies):
    """Pick one proxy at random and return it as an http/https mapping.

    Returns an empty dict when ``proxies`` is empty/falsy or when the chosen
    entry itself is falsy.
    """
    if not proxies:
        return {}

    picked = choice(proxies)
    if not picked:
        return {}
    return dict(http=picked, https=picked)
143+
144+
def get_agent():
    """Return one browser User-Agent string chosen at random from a fixed list."""
    # Every entry needs its own trailing comma: adjacent string literals
    # without one are concatenated by Python into a single, invalid User-Agent.
    return choice([
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:104.0) Gecko/20100101 Firefox/104.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 12.5; rv:104.0) Gecko/20100101 Firefox/104.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 13_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15',
    ])
157+
158+
def web_request(url, timeout=3, proxies=None, **kwargs):
    """Send a GET request to ``url`` and return the response, or False on failure.

    Args:
        url: Address to request.
        timeout: Timeout passed to ``Session.send``.
        proxies: Optional list of proxies; one is picked per request by
            ``get_proxy``. None or an empty list means no proxy.
        **kwargs: Extra keyword arguments forwarded to ``requests.Request``.

    Returns:
        The ``requests`` response object, or False when the request failed.
    """
    try:
        # Context manager closes the session (and its pooled connections)
        # once the response has been received.
        with requests.Session() as s:
            r = requests.Request('GET', url, headers={'User-Agent': get_agent()}, cookies={'CONSENT': 'YES'}, **kwargs)
            p = r.prepare()
            return s.send(p, timeout=timeout, verify=False, proxies=get_proxy(proxies))
    except requests.exceptions.TooManyRedirects as e:
        Log.fail('Proxy Error: {}'.format(e))
    except Exception as e:
        # Still best effort (the caller gets False), but the cause is recorded
        # and KeyboardInterrupt/SystemExit are no longer swallowed here.
        logging.debug('Web request failed: {}'.format(e))
    return False
170+
171+
def extract_links(resp):
    """Parse ``resp.content`` with the lxml parser and return all ``<a>`` tags as a list."""
    parsed = BeautifulSoup(resp.content, 'lxml')
    return [anchor for anchor in parsed.findAll('a')]
178+
179+
def extract_subdomain(url):
    """Return the network-location (``netloc``) component of ``url``."""
    parts = urlparse(url)
    return parts.netloc
0 commit comments