From 35cda5f6aff54b62ec8e2d9a123f899700b60323 Mon Sep 17 00:00:00 2001 From: Xiaohang Fu <564632204@qq.com> Date: Thu, 17 Jun 2021 09:13:44 +0800 Subject: [PATCH 1/2] add beike spider --- beike/beike.py | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 beike/beike.py diff --git a/beike/beike.py b/beike/beike.py new file mode 100644 index 0000000..cd5a64a --- /dev/null +++ b/beike/beike.py @@ -0,0 +1,89 @@ +""" +info: +author:Forest216 +github:https://github.com/Forest216/ +update_time:2021-6-16 +""" + +import requests +from bs4 import BeautifulSoup + + +header = { + 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36', + 'referer': 'https://nj.zu.ke.com/' +} + + +def get_url(): #获取url地址 切换城市只需要将nj换为目标城市的缩写即可 + url_base='https://nj.zu.ke.com/zufang/pg' + lists = [] + for i in range(1,11): + lists.append(url_base+str(i)) + return lists + + +def get_info(target_url): + house_list = [] + + html = requests.get(target_url, headers=header) + html_bs = BeautifulSoup(html.text, "lxml") + goods_div = html_bs.find_all('div', class_='content__list--item') + for good in goods_div: + good_temp = {} + + #获取价格 + price_div = good.find_all('span', class_='content__list--item-price') + price_i = price_div[0].find_all('em') + price = price_i[0].text + good_temp['price'] = price + + #获取标题 + title_div = good.find_all('p', class_='content__list--item--title') + title_em = title_div[0].find_all('a') + title = title_em[0].text.replace(' ', '').replace('\n', '') + good_temp['title'] = title + + area_div = good.find_all('p', class_='content__list--item--des') + + detail = area_div[0].text.replace(' ', '').replace('\n', '') # 江宁-百家湖-朗诗玲珑屿/84.50㎡/南/3室1厅1卫/低楼层(32层) + area = detail.split('/')[0] + location_qu = area.split('-')[0] # 区划 如栖霞区 + location_big = area.split('-')[1] # 位置 如仙林 + location_small = area.split('-')[2] # 小区名 如东方天郡 + size = detail.split('/')[1][:-1] # 面积 去掉m2 + direction = detail.split('/')[2] # 朝向 + room = detail.split('/')[3] # 房间数量 x室x厅 + floor = detail.split('/')[4] # 楼层 + + good_temp['location_qu'] = location_qu + good_temp['location_big'] = location_big + good_temp['location_small'] = location_small + good_temp['size'] = size + good_temp['direction'] = direction + good_temp['room'] = room + good_temp['floor'] = floor + + #图片地址 + image_div = good.find_all('a', class_='content__list--item--aside') + image_img = image_div[0].find_all('img') + image = image_img[0].get('data-src') + good_temp['image'] = image + + #租房页 + link_div = good.find_all('a', class_='content__list--item--aside') + link = 'https://nj.zu.ke.com' + link_div[0]['href'] + good_temp['link'] = link + + house_list.append(good_temp) + print(good_temp) + + + +if __name__=='__main__': + url_lists=get_url() + for url in url_lists: + get_info(url) + + + From 5deb73041d32d03e83394a941cee75c6889b5a59 Mon Sep 17 00:00:00 2001 From: Forest216 <564632204@qq.com> Date: Sat, 24 Jul 2021 11:14:42 +0800 Subject: [PATCH 2/2] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E4=BA=86=E9=83=A8?= =?UTF-8?q?=E5=88=86=E4=BF=A1=E6=81=AF=E8=A7=A3=E6=9E=90=E5=A4=B1=E8=B4=A5?= =?UTF-8?q?bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- beike/beike.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/beike/beike.py b/beike/beike.py index cd5a64a..c56725c 100644 --- a/beike/beike.py +++ b/beike/beike.py @@ -11,11 +11,11 @@ header = { 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36', - 'referer': 'https://nj.zu.ke.com/' + 'referer': 'https://www.jd.com/' } -def get_url(): #获取url地址 切换城市只需要将nj换为目标城市的缩写即可 +def get_url(): #获取url地址 切换城市只需要将nj换为目标城市的缩写即可 如nj南京 sh上海 gz广州 hz杭州 bj北京 wh武汉等待 url_base='https://nj.zu.ke.com/zufang/pg' lists = [] for i in range(1,11): @@ -23,11 +23,11 @@ def get_url(): #获取url地址 切换城市只需要将nj换为目标城市的 return lists -def get_info(target_url): +def get_info(target_url): #返回一个dict型的list,包含各种信息 house_list = [] html = requests.get(target_url, headers=header) - html_bs = BeautifulSoup(html.text, "lxml") + html_bs = BeautifulSoup(html.text, "html5lib") goods_div = html_bs.find_all('div', class_='content__list--item') for good in goods_div: good_temp = {} @@ -48,6 +48,10 @@ def get_info(target_url): detail = area_div[0].text.replace(' ', '').replace('\n', '') # 江宁-百家湖-朗诗玲珑屿/84.50㎡/南/3室1厅1卫/低楼层(32层) area = detail.split('/')[0] + if area=='精选': + area = detail.split('/')[1] + if '-' not in area or len(area.split('-'))<3: + continue location_qu = area.split('-')[0] # 区划 如栖霞区 location_big = area.split('-')[1] # 位置 如仙林 location_small = area.split('-')[2] # 小区名 如东方天郡