Python web scraper (simulating a web browser)

The script below drives Chrome through Selenium to search zhipin.com (Boss直聘) for a keyword, parses each page of results with BeautifulSoup, and stores the job listings in a local SQLite database.

import os
import sqlite3
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

class WebDriverSpider:
    # Initialise: create the SQLite database on first run, then open a connection.
    def __init__(self):
        db_path = 'boss.sqlite'  # SQLite database file
        if not os.path.exists(db_path):
            conn = sqlite3.connect(db_path)  # creates the database file
            c = conn.cursor()  # get a cursor object
            c.execute('''CREATE TABLE jobs
                (
                    job_id       INTEGER PRIMARY KEY,
                    job_name     TEXT,
                    job_area     CHAR(50),
                    job_salary   CHAR(50),
                    job_exp      CHAR(50),
                    degree       CHAR(50),
                    company_name CHAR(50),
                    company_type CHAR(50),
                    stage        CHAR(50),
                    job_href     TEXT
                );
            ''')
            conn.commit()  # commit so the schema is persisted
            conn.close()
            print('Database created')
        self.conn = sqlite3.connect(db_path)
        self.c = self.conn.cursor()
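    # Note (not in the original): SQLite also supports CREATE TABLE IF NOT EXISTS,
    # which would remove the need for the os.path.exists() check above.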

    # Close the database connection.
    def close_connect(self):
        self.conn.close()

    # Parse one result page and save every job listing on it.
    def extraction(self, data):
        for item in data.find_all(class_="job-primary"):
            job_title = item.find("div", attrs={"class": "job-title"})
            job_name = job_title.a.attrs["title"]              # job title
            job_href = job_title.a.attrs["href"]               # link to the listing
            job_area = job_title.find(class_="job-area").text  # location

            job_limit = item.find(class_="job-limit")
            job_salary = job_limit.span.text   # salary
            job_exp = job_limit.p.contents[0]  # required experience
            degree = job_limit.p.contents[2]   # required education

            company = item.find(class_="info-company")
            company_name = company.h3.a.text  # company name
            company_type = company.p.a.text   # industry tag
            stage = company.p.contents[2]     # company growth stage

            self.save(job_name, job_area, job_salary, job_exp, degree,
                      company_name, company_type, stage, job_href)
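    # Note (not in the original): every find() above returns None if the site's
    # markup changes, so a hardened version would guard these lookups and skip
    # items that fail to parse instead of raising AttributeError.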

    # Store one row. A parameterized query replaces the original string
    # concatenation, which broke on quotes in the data and allowed SQL injection.
    def save(self, job_name, job_area, job_salary, job_exp, degree,
             company_name, company_type, stage, job_href):
        self.c.execute(
            "INSERT INTO jobs (job_name, job_area, job_salary, job_exp, degree,"
            " company_name, company_type, stage, job_href)"
            " VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?);",
            # str() turns BeautifulSoup NavigableStrings into plain strings
            (str(job_name), str(job_area), str(job_salary), str(job_exp), str(degree),
             str(company_name), str(company_type), str(stage), str(job_href)),
        )
        self.conn.commit()
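    # Note (not in the original): committing after every row is simple but slow on
    # large scrapes; collecting the rows for a page and using cursor.executemany()
    # with a single commit would batch the inserts.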

    def run(self, url, keyword, pages):
        # Selenium 4 passes the driver path via a Service object; the old
        # executable_path keyword and find_element_by_* helpers were removed.
        # The raw string keeps '\c' from being read as an escape sequence.
        service = Service(r"D:\chromedriver.exe")
        driver = webdriver.Chrome(service=service)
        driver.get(url)

        # Locate the search box and type in the query.
        driver.find_element(By.NAME, "query").send_keys(keyword)
        # Click the search button.
        driver.find_element(By.CLASS_NAME, "btn-search").click()

        this_page = 1
        while this_page <= pages:
            this_page += 1
            time.sleep(5)  # crude wait for the results to render
            data = driver.execute_script("return document.documentElement.outerHTML")
            data = BeautifulSoup(data, "html.parser")
            self.extraction(data)
            # Stop when there is no "next" button left to click.
            next_page = data.find(class_="next")
            if next_page:
                driver.find_element(By.CLASS_NAME, "next").click()
            else:
                break

        driver.quit()  # quit() ends the session; close() only closes the window

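# Note (not in the original): for unattended runs Chrome can be started headless.
# A minimal sketch, reusing the Service object from run():
#
#   options = webdriver.ChromeOptions()
#   options.add_argument("--headless=new")
#   driver = webdriver.Chrome(service=service, options=options)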
if __name__ == '__main__':
    spider = WebDriverSpider()

    url = 'https://www.zhipin.com/'
    keyword = '数据分析兼职'  # search keyword: part-time data analysis

    spider.run(url, keyword, 5)
    spider.close_connect()
    print('Scrape finished!')
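
Once a run finishes, the scraped rows can be inspected directly from boss.sqlite. A minimal sketch, assuming the jobs table created above:

import sqlite3

conn = sqlite3.connect('boss.sqlite')
for row in conn.execute("SELECT job_name, job_salary, company_name FROM jobs LIMIT 10"):
    print(row)
conn.close()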