# Python web crawler (basic version)

import requests
import random
from bs4 import BeautifulSoup
import sqlite3
import os

class basicsplider:
    """Scrape second-hand housing listings and store them in SQLite.

    Each listing found on a page is parsed into ten text fields and
    inserted as one row of the ``ershoufang`` table.
    """

    # User-Agent strings; one is picked at random for every request.
    USER_AGENTS = (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
    )

    # HTTP proxy endpoints; one is picked at random for every request.
    HTTP_PROXIES = (
        '183.166.139.42:9999',
        '183.166.70.83:9999',
        '171.35.174.28:9999',
    )

    def __init__(self, db_path='fang_data.sqlite'):
        """Open the database and make sure the ``ershoufang`` table exists.

        Args:
            db_path: Path of the SQLite file. Defaults to
                ``'fang_data.sqlite'`` in the current working directory.
        """
        is_new_db = not os.path.exists(db_path)
        self.conn = sqlite3.connect(db_path)
        self.c = self.conn.cursor()
        # IF NOT EXISTS also repairs a database file that exists but was
        # left without the table (e.g. an earlier run died before creating it).
        self.c.execute('''CREATE TABLE IF NOT EXISTS ershoufang
            (
                fang_id INTEGER PRIMARY KEY,
                fang_title CHAR(100),
                fang_flood CHAR(50),
                fang_layout CHAR(50),
                fang_area CHAR(50),
                fang_orientation CHAR(50),
                fang_built CHAR(50),
                fang_total CHAR(50),
                fang_unit CHAR(50),
                fang_followers CHAR(50),
                fang_publish CHAR(50)
            );
        ''')
        self.conn.commit()
        if is_new_db:
            print('数据库创建成功')

    def close_connect(self):
        """Close the database connection."""
        self.conn.close()

    def get_headers(self):
        """Return request headers with a randomly chosen User-Agent."""
        return {
            'User-Agent': random.choice(self.USER_AGENTS),
            'Accept-Language': 'zh-cn,zh;q=0.5',
        }

    def get_proxies(self):
        """Return a ``proxies`` mapping for ``requests`` with a random proxy.

        The mapping key must be the bare scheme (``'http'``); a key such as
        ``'http:'`` never matches and the proxy would be silently ignored.
        Only plain-HTTP URLs are routed through the proxy; HTTPS requests
        are unaffected because no ``'https'`` entry is provided.
        """
        return {'http': random.choice(self.HTTP_PROXIES)}

    def get_data(self, url, timeout=10):
        """Fetch ``url`` and parse the response body as HTML.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the crawl forever.

        Returns:
            A ``(soup, status_code)`` tuple, or ``(None, None)`` when the
            request or the parsing failed (the error is printed).
        """
        headers = self.get_headers()
        proxies = self.get_proxies()
        try:
            response = requests.get(
                url, headers=headers, proxies=proxies, timeout=timeout
            )
            soup = BeautifulSoup(response.text, 'lxml')
            return soup, response.status_code
        except Exception as e:
            # Best effort: a failed page is reported and skipped by the caller.
            print(str(e))
            return None, None

    def extraction(self, soup):
        """Parse every listing in ``soup`` and save it to the database.

        A listing that is missing an expected element is reported and
        skipped; the remaining listings are still processed.

        Args:
            soup: Parsed page as returned by ``get_data``. ``None`` is
                accepted and treated as a page without listings.
        """
        if soup is None:
            return
        for fang in soup.find_all('div', class_="info clear"):
            try:
                fang_title = fang.find('div', class_="title").a.text.strip()  # listing headline
                fang_flood = fang.find('div', class_="flood").a.text.strip()  # residential community

                # The address line is a '|'-separated list of attributes.
                fang_address = fang.find('div', class_="address").div.text.strip()
                parts = [part.strip() for part in fang_address.split('|')]
                fang_layout = parts[0]       # layout
                fang_area = parts[1]         # floor area
                fang_orientation = parts[2]  # orientation
                fang_built = parts[5]        # year built
                # Field 5 is only kept when it ends with '建'; otherwise it
                # is not a build-year entry and is stored as empty.
                if not fang_built.endswith('建'):
                    fang_built = ''

                fang_priceinfo = fang.find('div', class_="priceInfo")
                fang_total = fang_priceinfo.find('div', class_="totalPrice").text.strip()     # total price
                fang_unit = fang_priceinfo.find('div', class_="unitPrice").span.text.strip()  # unit price

                # Follow info is "<followers> / <publish time>".
                follow_parts = fang.find('div', class_="followInfo").text.split('/')
                fang_followers = follow_parts[0].strip()  # number of followers
                fang_publish = follow_parts[1].strip()    # time since publication

                # Keyword arguments keep every value bound to the right
                # parameter regardless of the order save() declares them in.
                self.save(
                    fang_title=fang_title,
                    fang_flood=fang_flood,
                    fang_layout=fang_layout,
                    fang_area=fang_area,
                    fang_orientation=fang_orientation,
                    fang_built=fang_built,
                    fang_followers=fang_followers,
                    fang_publish=fang_publish,
                    fang_unit=fang_unit,
                    fang_total=fang_total,
                )
            except Exception as e:
                # Best effort: report the malformed listing and continue.
                print(str(e))

    def save(self, fang_title, fang_flood, fang_layout, fang_area, fang_orientation, fang_built, fang_followers, fang_publish, fang_unit, fang_total):
        """Insert one listing into the ``ershoufang`` table and commit.

        Values are bound through named placeholders, so quotes or other
        special characters in scraped text cannot break or alter the SQL.

        Args:
            fang_title: Listing headline.
            fang_flood: Residential community name.
            fang_layout: Room layout.
            fang_area: Floor area.
            fang_orientation: Orientation.
            fang_built: Year built, or ``''`` when unknown.
            fang_followers: Number of followers.
            fang_publish: Time since publication.
            fang_unit: Unit price.
            fang_total: Total price.
        """
        self.c.execute(
            "INSERT INTO ershoufang "
            "(fang_title, fang_flood, fang_layout, fang_area, fang_orientation, "
            "fang_built, fang_unit, fang_total, fang_followers, fang_publish) "
            "VALUES (:fang_title, :fang_flood, :fang_layout, :fang_area, "
            ":fang_orientation, :fang_built, :fang_unit, :fang_total, "
            ":fang_followers, :fang_publish);",
            {
                'fang_title': fang_title,
                'fang_flood': fang_flood,
                'fang_layout': fang_layout,
                'fang_area': fang_area,
                'fang_orientation': fang_orientation,
                'fang_built': fang_built,
                'fang_unit': fang_unit,
                'fang_total': fang_total,
                'fang_followers': fang_followers,
                'fang_publish': fang_publish,
            },
        )
        self.conn.commit()

    def run(self, url):
        """Download ``url`` and store its listings when the status is 200."""
        soup, status_code = self.get_data(url)
        if status_code == 200:
            self.extraction(soup)

if __name__ == '__main__':
    # Crawl result pages 1-4 of the Chaoyang second-hand listings.
    splider = basicsplider()
    try:
        for i in range(1, 5):
            url = 'https://bj.lianjia.com/ershoufang/chaoyang/pg' + str(i) + '/'
            splider.run(url)
    finally:
        # Release the database connection even if the crawl is interrupted.
        splider.close_connect()
    print('抓取完成!')