Python web scraper (fetching asynchronously loaded data)
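The comments on a JD product page are loaded asynchronously: the HTML itself does not contain them, and the browser fetches them from a separate interface (club.jd.com/comment/productPageComments.action). That interface returns JSONP rather than plain JSON, i.e. the payload is wrapped in a callback call. Judging from the fields the spider reads, the response looks roughly like this:

fetchJSON_comment98({"comments": [{"content": "...", "nickname": "...", "creationTime": "..."}, ...]});

The spider below requests that interface page by page, strips the callback wrapper, parses the JSON, and stores every comment in a local SQLite database.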

import requests
import random
import sqlite3
import os
import json

class AsyncSpider:
    # Initialize: create the database file if it does not exist yet,
    # then open a connection for the lifetime of the spider.
    def __init__(self):
        db_path = 'book.sqlite'  # SQLite database file
        if not os.path.exists(db_path):
            conn = sqlite3.connect(db_path)  # creates the SQLite database
            c = conn.cursor()  # get a cursor object
            c.execute('''CREATE TABLE comment
                (
                    comment_id INTEGER PRIMARY KEY,
                    comment TEXT,
                    nickname CHAR(50),
                    comment_time CHAR(50)
                );
            ''')
            conn.commit()  # commit so the table creation takes effect
            conn.close()
            print('Database created')
        self.conn = sqlite3.connect(db_path)
        self.c = self.conn.cursor()

    # Close the database connection.
    def close_connect(self):
        self.conn.close()

    # Build request headers with a randomly chosen User-Agent.
    def get_headers(self):
        agent_list = [
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
        ]
        user_agent = random.choice(agent_list)
        headers = {'User-Agent': user_agent, 'Accept-Language': 'zh-cn,zh;q=0.5'}
        return headers

    # Pick a random HTTP proxy. requests expects the form
    # {'http': 'http://host:port'}. These free proxies are likely stale,
    # so replace them with working ones before running.
    def get_proxies(self):
        http_proxies = [
            {'http': 'http://183.166.139.42:9999'},
            {'http': 'http://183.166.70.83:9999'},
            {'http': 'http://171.35.174.28:9999'}
        ]
        proxies = random.choice(http_proxies)
        return proxies

    # Request the URL and unwrap the JSONP response into a JSON object.
    def get_data(self, url):
        headers = self.get_headers()
        proxies = self.get_proxies()
        try:
            response = requests.get(url, headers=headers, proxies=proxies, timeout=10)
            status_code = response.status_code
            # Decoding as iso-8859-1 maps each raw byte to the character with
            # the same code point, so nothing is lost; the real decoding
            # (GB18030) happens later in extraction().
            data = str(response.content, encoding='iso-8859-1')
            # Strip the JSONP wrapper fetchJSON_comment98( ... ); so that
            # only the JSON payload remains. json.loads handles the bare
            # true/false/null literals natively.
            data = data[data.find('(') + 1:data.rfind(')')]
            json_obj = json.loads(data)
            # print(json_obj)
            return json_obj, status_code
        except Exception as e:
            print(str(e))
            return None, None

    # Pull the wanted fields out of the parsed JSON.
    def extraction(self, json_obj):
        comments = json_obj['comments']
        for item in comments:
            try:
                # Undo the iso-8859-1 round trip: the payload is GB18030-encoded.
                comment = item['content'].encode(encoding='iso-8859-1').decode('GB18030')    # comment text
                nickname = item['nickname'].encode(encoding='iso-8859-1').decode('GB18030')  # commenter nickname
                comment_time = item['creationTime']  # comment timestamp

                self.save(comment, nickname, comment_time)
            except Exception as e:
                print(str(e))

    # Store one comment row. A parameterized query avoids SQL injection and
    # breakage when the comment text itself contains quotes.
    def save(self, comment, nickname, comment_time):
        self.c.execute(
            "INSERT INTO comment (comment, nickname, comment_time) VALUES (?, ?, ?);",
            (comment, nickname, comment_time)
        )
        self.conn.commit()

    def run(self, url):
        json_obj, status_code = self.get_data(url)
        if status_code == 200:
            self.extraction(json_obj)

if __name__ == '__main__':
    spider = AsyncSpider()

    # Page through the first four pages of comments (page=0..3).
    for i in range(0, 4):
        url = 'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId=12911704&score=0&sortType=5&page=' + str(i) + '&pageSize=10&isShadowSku=0&rid=0&fold=1'
        spider.run(url)
    spider.close_connect()
    print('Scraping finished!')
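One subtlety worth demonstrating: get_data() decodes the response as iso-8859-1 even though the payload is GB18030-encoded. Because iso-8859-1 assigns a character to every possible byte value, the round trip is lossless, which is why extraction() can still recover the Chinese text afterwards. A minimal demonstration:

raw = '中文'.encode('GB18030')             # bytes as they arrive over the wire
text = raw.decode('iso-8859-1')            # one character per byte; nothing is lost
assert text.encode('iso-8859-1') == raw    # the original bytes round-trip exactly
print(text.encode('iso-8859-1').decode('GB18030'))  # -> 中文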
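To check the result, a minimal sketch that reads the stored rows back out of book.sqlite (run it after the spider has finished; the file and table names come from the code above):

import sqlite3

conn = sqlite3.connect('book.sqlite')
c = conn.cursor()
# Print each stored row: (comment_id, comment, nickname, comment_time)
for row in c.execute('SELECT comment_id, comment, nickname, comment_time FROM comment'):
    print(row)
conn.close()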