From 35cda5f6aff54b62ec8e2d9a123f899700b60323 Mon Sep 17 00:00:00 2001
From: Xiaohang Fu <564632204@qq.com>
Date: Thu, 17 Jun 2021 09:13:44 +0800
Subject: [PATCH 1/2] add beike spider

---
 beike/beike.py | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 89 insertions(+)
 create mode 100644 beike/beike.py

diff --git a/beike/beike.py b/beike/beike.py
new file mode 100644
index 0000000..cd5a64a
--- /dev/null
+++ b/beike/beike.py
@@ -0,0 +1,89 @@
+"""
+info:
+author:Forest216
+github:https://github.com/Forest216/
+update_time:2021-6-16
+"""
+
+import requests
+from bs4 import BeautifulSoup
+
+
+header = {
+    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
+    'referer': 'https://nj.zu.ke.com/'
+}
+
+
+def get_url(): #获取url地址 切换城市只需要将nj换为目标城市的缩写即可
+    url_base='https://nj.zu.ke.com/zufang/pg'
+    lists = []
+    for i in range(1,11):
+        lists.append(url_base+str(i))
+    return lists
+
+
+def get_info(target_url):
+    house_list = []
+
+    html = requests.get(target_url, headers=header)
+    html_bs = BeautifulSoup(html.text, "lxml")
+    goods_div = html_bs.find_all('div', class_='content__list--item')
+    for good in goods_div:
+        good_temp = {}
+
+        # price: text of the first <em> inside the price span
+        price_div = good.find_all('span', class_='content__list--item-price')
+        price_i = price_div[0].find_all('em')
+        price = price_i[0].text
+        good_temp['price'] = price
+
+        # title: link text with spaces and newlines stripped
+        title_div = good.find_all('p', class_='content__list--item--title')
+        title_em = title_div[0].find_all('a')
+        title = title_em[0].text.replace(' ', '').replace('\n', '')
+        good_temp['title'] = title
+
+        area_div = good.find_all('p', class_='content__list--item--des')
+
+        detail = area_div[0].text.replace(' ', '').replace('\n', '')  # 江宁-百家湖-朗诗玲珑屿/84.50㎡/南/3室1厅1卫/低楼层（32层）
+        area = detail.split('/')[0]
+        location_qu = area.split('-')[0]  # 区划 如栖霞区
+        location_big = area.split('-')[1]  # 位置 如仙林
+        location_small = area.split('-')[2]  # 小区名 如东方天郡
+        size = detail.split('/')[1][:-1]  # floor area, with the trailing unit character dropped
+        direction = detail.split('/')[2]  # orientation
+        room = detail.split('/')[3]  # layout, e.g. 3室1厅1卫
+        floor = detail.split('/')[4]  # floor description
+
+        good_temp['location_qu'] = location_qu
+        good_temp['location_big'] = location_big
+        good_temp['location_small'] = location_small
+        good_temp['size'] = size
+        good_temp['direction'] = direction
+        good_temp['room'] = room
+        good_temp['floor'] = floor
+
+        # image URL, read from the <img> tag's data-src attribute
+        image_div = good.find_all('a', class_='content__list--item--aside')
+        image_img = image_div[0].find_all('img')
+        image = image_img[0].get('data-src')
+        good_temp['image'] = image
+
+        # listing page link: site root + the aside anchor's href
+        link_div = good.find_all('a', class_='content__list--item--aside')
+        link = 'https://nj.zu.ke.com' + link_div[0]['href']
+        good_temp['link'] = link
+
+        house_list.append(good_temp)
+        print(good_temp)
+    return house_list
+
+
+if __name__ == '__main__':
+    # Scrape each listing page in order; get_info prints every record it parses.
+    for page_url in get_url():
+        get_info(page_url)
+
+
+

From 5deb73041d32d03e83394a941cee75c6889b5a59 Mon Sep 17 00:00:00 2001
From: Forest216 <564632204@qq.com>
Date: Sat, 24 Jul 2021 11:14:42 +0800
Subject: [PATCH 2/2] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E4=BA=86=E9=83=A8?=
 =?UTF-8?q?=E5=88=86=E4=BF=A1=E6=81=AF=E8=A7=A3=E6=9E=90=E5=A4=B1=E8=B4=A5?=
 =?UTF-8?q?bug?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 beike/beike.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/beike/beike.py b/beike/beike.py
index cd5a64a..c56725c 100644
--- a/beike/beike.py
+++ b/beike/beike.py
@@ -11,11 +11,11 @@
 
 header = {
     'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
-    'referer': 'https://nj.zu.ke.com/'
+    'referer': 'https://www.jd.com/'
 }
 
 
-def get_url(): #获取url地址 切换城市只需要将nj换为目标城市的缩写即可
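+def get_url(): # Build the listing-page URLs. To switch city, replace "nj" with the target city's abbreviation, e.g. nj Nanjing, sh Shanghai, gz Guangzhou, hz Hangzhou, bj Beijing, wh Wuhan, etc.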
     url_base='https://nj.zu.ke.com/zufang/pg'
     lists = []
     for i in range(1,11):
@@ -23,11 +23,11 @@ def get_url(): #获取url地址 切换城市只需要将nj换为目标城市的
     return lists
 
 
-def get_info(target_url):
+def get_info(target_url): # Parse one listing page: collects one dict per listing into house_list and prints each dict
     house_list = []
 
     html = requests.get(target_url, headers=header)
-    html_bs = BeautifulSoup(html.text, "lxml")
+    html_bs = BeautifulSoup(html.text, "html5lib")
     goods_div = html_bs.find_all('div', class_='content__list--item')
     for good in goods_div:
         good_temp = {}
@@ -48,6 +48,10 @@ def get_info(target_url):
 
         detail = area_div[0].text.replace(' ', '').replace('\n', '')  # 江宁-百家湖-朗诗玲珑屿/84.50㎡/南/3室1厅1卫/低楼层（32层）
         area = detail.split('/')[0]
+        if area=='精选':  # first field is only a "精选" tag, so every real field sits one slot later
+            area, detail = detail.split('/')[1], detail.split('/', 1)[1]  # re-base detail so the later [1]..[4] lookups stay aligned
+        if len(area.split('-'))<3:  # need three '-'-separated location parts; skip anything else
+            continue
         location_qu = area.split('-')[0]  # 区划 如栖霞区
         location_big = area.split('-')[1]  # 位置 如仙林
         location_small = area.split('-')[2]  # 小区名 如东方天郡