爬虫Spider

该爬虫分为两部分,分别为Spider.py和model.py

Spider.py

该文件主要业务逻辑是调用Selenium,通过自动化测试的方法模拟人的行为来对网页进行请求,并将请求到的HTML提取为文本,为后续的数据抽取做铺垫。需要的第三方工具包:Selenium、scrapy、time(标准库)

spider.py具体实现

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
from selenium import webdriver  # 使用chromedriver必需
from scrapy import Selector # 使用Selector
import time # TimeSleep休眠
from Interests.models import * #表示models文件处于Interests文件夹下
from selenium.common.exceptions import *

# Category listing page for motherboards on zol.com.cn.
domin = "https://detail.zol.com.cn/motherboard"
# Raw string so the backslashes in the Windows path are literal; the original
# plain string only worked because "\M" happens not to be a recognized escape.
broswer = webdriver.Chrome(executable_path=r"C:\MyApplications\chromedriver.exe")

url_list = []  # absolute product detail-page URLs, filled by Get_Url()

def Get_Url():
    """Load the category listing page and collect absolute product URLs.

    Side effects: navigates the shared `broswer` and appends each absolute
    product URL to the module-level `url_list` (also printed for progress).
    """
    broswer.get(domin)
    sel = Selector(text=broswer.page_source)
    # Relative hrefs of every product entry in the listing.
    info = sel.xpath("//ul[@class='clearfix']/li/a/@href").extract()
    for href in info:
        product_url = 'https://detail.zol.com.cn' + href
        print(product_url)
        url_list.append(product_url)

def Get_Info():
    """Scrape spec fields from every URL in `url_list`; save one Info row each.

    Raises:
        IndexError: if any of the xpaths matches nothing on a page
            (same behavior as the original ``extract()[0]`` calls).
    """
    # Info attribute name -> xpath of the corresponding spec cell.
    field_xpaths = {
        "core": "//ul/li[1]/p[1]/text()",
        "radio_core": "//ul/li[1]/p[2]/text()",
        "ram_type": "//ul/li[2]/p[1]/text()",
        "max_ram": "//ul/li[2]/p[2]/text()",
        "appear_size": "//ul/li[3]/p[1]/text()",
        "broad_type": "//ul/li[3]/p[2]/text()",
        "energy_socket": "//ul/li[4]/p[1]/text()",
        "charge_mode": "//ul/li[4]/p[2]/text()",
    }
    for product_url in url_list:
        broswer.get(product_url)
        detail = Selector(text=broswer.page_source)  # single product detail page

        info = Info()
        info.name = detail.xpath("//h1/text()").extract()[0]
        for attr, xpath in field_xpaths.items():
            setattr(info, attr, detail.xpath(xpath).extract()[0])

        # force_insert: always create a new row rather than attempt an UPDATE.
        info.save(force_insert=True)

if __name__ == "__main__":
    # BUG FIX: the original did `t1 = Get_Url()` then `t1.start()` —
    # Get_Url()/Get_Info() are plain functions returning None, so the
    # `.start()` calls raised AttributeError. They are also order-dependent
    # (Get_Info reads url_list filled by Get_Url), so run them sequentially.
    Get_Url()
    time.sleep(3)  # brief pause between the listing crawl and the detail crawls
    Get_Info()

models.py

models.py的主要功能是连接数据库,数据库中的表格不必提前建立好。通过peewee第三方工具包创建一个指定数据库的基础模型类;模型字段的数据类型可以为CharField、TextField、DateField等。

models具体代码实现
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
from peewee import *

# MySQL connection for the `demo` database on localhost.
# NOTE(review): credentials are hard-coded — move to config/env for real use.
db = MySQLDatabase("demo", host="127.0.0.1", port=3306, user="root", password="123456")

# 创建一个指定数据库的基础模型类。
# 定义一个建立数据库连接的基础模型类,这样就不必为后续模型指定数据库
class BaseModel(Model):
    """Base model binding every subclass to the shared `db` connection,
    so later models need not specify the database themselves."""

    class Meta:
        database = db

class Info(BaseModel):
    """One motherboard's spec record scraped from zol.com.cn."""

    name = CharField()           # product name (page <h1>)
    core = CharField()           # chipset / core spec
    radio_core = TextField()
    ram_type = TextField()
    max_ram = CharField()
    appear_size = TextField()
    broad_type = TextField()
    energy_socket = TextField()
    charge_mode = TextField()

if __name__ == "__main__":
    # Create the table for Info if it does not exist yet
    # (peewee generates the DDL from the model definition).
    db.create_tables([Info])

y