jdspider

先占个位置

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
import requests
from lxml import etree
import csv
from bs4 import BeautifulSoup
from selenium import webdriver
import re

def getHTMLText(goods):
    """Fetch the JD search-result page for a keyword.

    Args:
        goods: Search keyword entered by the user.

    Returns:
        The decoded HTML text of the response, or an empty string when the
        request fails (connection error, timeout, or HTTP error status).
    """
    url = 'https://search.jd.com/Search'
    # Let requests build the query string so keywords containing characters
    # such as '&', '#', '+' or spaces are percent-encoded instead of
    # corrupting the URL.
    query = {'keyword': goods, 'enc': 'utf-8'}
    # Request headers sent with the search request.
    # NOTE(review): these look like values captured from a browser session
    # (including a hard-coded Cookie) -- confirm they are still needed/valid.
    head = {
        'authority': 'search.jd.com',
        'method': 'GET',
        'path': '/s_new.php?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&page=4&s=84&scrolling=y&log_id=1529828108.22071&tpl=3_M&show_items=7651927,7367120,7056868,7419252,6001239,5934182,4554969,3893501,7421462,6577495,26480543553,7345757,4483120,6176077,6932795,7336429,5963066,5283387,25722468892,7425622,4768461',
        'scheme': 'https',
        'referer': 'https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&page=3&s=58&click=0',
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
        'x-requested-with': 'XMLHttpRequest',
        'Cookie': 'qrsc=3; pinId=RAGa4xMoVrs; xtest=1210.cf6b6759; ipLocation=%u5E7F%u4E1C; _jrda=5; TrackID=1aUdbc9HHS2MdEzabuYEyED1iDJaLWwBAfGBfyIHJZCLWKfWaB_KHKIMX9Vj9_2wUakxuSLAO9AFtB2U0SsAD-mXIh5rIfuDiSHSNhZcsJvg; shshshfpa=17943c91-d534-104f-a035-6e1719740bb6-1525571955; shshshfpb=2f200f7c5265e4af999b95b20d90e6618559f7251020a80ea1aee61500; cn=0; 3AB9D23F7A4B3C9B=QFOFIDQSIC7TZDQ7U4RPNYNFQN7S26SFCQQGTC3YU5UZQJZUBNPEXMX7O3R7SIRBTTJ72AXC4S3IJ46ESBLTNHD37U; ipLoc-djd=19-1607-3638-3638.608841570; __jdu=930036140; user-key=31a7628c-a9b2-44b0-8147-f10a9e597d6f; areaId=19; __jdv=122270672|direct|-|none|-|1529893590075; PCSYCityID=25; mt_xid=V2_52007VwsQU1xaVVoaSClUA2YLEAdbWk5YSk9MQAA0BBZOVQ0ADwNLGlUAZwQXVQpaAlkvShhcDHsCFU5eXENaGkIZWg5nAyJQbVhiWR9BGlUNZwoWYl1dVF0%3D; __jdc=122270672; shshshfp=72ec41b59960ea9a26956307465948f6; rkv=V0700; __jda=122270672.930036140.-.1529979524.1529984840.85; __jdb=122270672.1.930036140|85.1529984840; shshshsID=f797fbad20f4e576e9c30d1c381ecbb1_1_1529984840145',
    }
    try:
        r = requests.get(url, params=query, headers=head, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        # Best effort: callers treat an empty string as "nothing fetched".
        # Only network/HTTP failures are swallowed; programming errors and
        # KeyboardInterrupt still propagate.
        return ""
    # Use the encoding detected from the body rather than the header default.
    r.encoding = r.apparent_encoding
    return r.text


def searchGoods(brand):
    """Let the user pick a brand from a JD search-result page.

    Prints the brands found in the page's brand list, prompts on stdin until
    a valid 1-based index is entered, and returns the chosen brand's URL.

    Args:
        brand: HTML text of a JD search page.

    Returns:
        "https://search.jd.com/" followed by the href of the chosen brand link.

    Raises:
        ValueError: If the page has no brand list or the list has no usable
            links (for example when the page could not be fetched).
    """
    soup = BeautifulSoup(brand, 'html.parser')
    brand_list = soup.find('ul', {"class": "J_valueList v-fixed"})
    if brand_list is None:
        # Previously this surfaced as an AttributeError on None.
        raise ValueError("no brand list found in the search page")

    Goods_href = []
    Goods_name = []
    for li in brand_list.find_all('li'):
        link = li.a
        # Skip entries without a usable anchor instead of crashing on them.
        if link is None or 'title' not in link.attrs or 'href' not in link.attrs:
            continue
        Goods_name.append(link.attrs['title'])
        print(link.attrs['title'])
        Goods_href.append(link.attrs['href'])

    count = len(Goods_href)
    if count == 0:
        # With nothing to choose from, the prompt loop below could never end.
        raise ValueError("the brand list contains no brand links")

    for index, name in enumerate(Goods_name, start=1):
        print("<<<{}.".format(index), "品牌 :" + name)

    while True:
        Goods_num = input("请输入品牌对应序号:")
        # isdecimal() only accepts characters int() can parse; isdigit() also
        # accepts e.g. superscript digits, which would make int() raise.
        if not Goods_num.isdecimal():
            print("您的输入有误,请输入数字:")
            continue
        a = int(Goods_num)
        if a > count:
            print("输入序号过大,请重新输入:")
        elif a < 1:
            print("输入序号过小,请重新输入:")
        else:
            break

    print("选择的品牌是: " + Goods_name[a - 1])
    brand_url = "https://search.jd.com/" + Goods_href[a - 1]
    return brand_url

def orderBy(brand_url):
    """Ask the user for a sort order and apply it to a brand search URL.

    Analysis of the site's URLs showed that the trailing
    'uc=0#J_searchWrap' fragment can be dropped and that the different sort
    orders only differ in the value of the 'psort' parameter, so the fragment
    is replaced by the matching 'psort=N'.

    Args:
        brand_url: Brand search URL, expected to contain 'uc=0#J_searchWrap'.

    Returns:
        The URL with the sort parameter applied. For the default order
        ('综合' or empty input) the URL is returned unchanged.
    """
    # Sort keyword typed by the user -> query parameter replacing the fragment.
    psort_by_kind = {
        '销量': 'psort=3',
        '评论数': 'psort=4',
        '新品': 'psort=5',
        '价格': 'psort=2',
    }
    while True:
        kind = input("按照:综合 / 销量 / 评论数 / 新品 / 价格 进行排序(默认综合)").strip()
        if kind in ('', '综合'):
            # Default order: hand back a usable URL. Returning 0 here made
            # the caller pass 0 to requests.get().
            return brand_url
        if kind in psort_by_kind:
            # The pattern is a plain literal, so str.replace is equivalent to
            # the former regex substitution.
            return brand_url.replace('uc=0#J_searchWrap', psort_by_kind[kind])
        print("输入有误,请重新输入:")

def focus_good(new_brand_url):
    """List the goods on a search page and let the user pick one.

    Downloads the page, prints a numbered "name, price" line for every item
    that has a name, a price and a link, then prompts on stdin until a valid
    1-based index is entered.

    Args:
        new_brand_url: URL of the (brand-filtered, sorted) search page.

    Returns:
        The list of href strings matched for the chosen item's name link.
        The list shape is kept because the caller extracts the URL from it.
        NOTE(review): the hrefs are presumably protocol-relative ("//..."),
        since the caller prefixes "https:" -- confirm.

    Raises:
        requests.RequestException: If the page cannot be downloaded.
        ValueError: If the page contains no listable goods.
    """
    # Request headers sent with the page request.
    # NOTE(review): these look like values captured from a browser session
    # (including a hard-coded Cookie) -- confirm they are still needed/valid.
    head = {
        'authority': 'search.jd.com',
        'method': 'GET',
        'path': '/s_new.php?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&page=4&s=84&scrolling=y&log_id=1529828108.22071&tpl=3_M&show_items=7651927,7367120,7056868,7419252,6001239,5934182,4554969,3893501,7421462,6577495,26480543553,7345757,4483120,6176077,6932795,7336429,5963066,5283387,25722468892,7425622,4768461',
        'scheme': 'https',
        'referer': 'https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&page=3&s=58&click=0',
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
        'x-requested-with': 'XMLHttpRequest',
        'Cookie': 'qrsc=3; pinId=RAGa4xMoVrs; xtest=1210.cf6b6759; ipLocation=%u5E7F%u4E1C; _jrda=5; TrackID=1aUdbc9HHS2MdEzabuYEyED1iDJaLWwBAfGBfyIHJZCLWKfWaB_KHKIMX9Vj9_2wUakxuSLAO9AFtB2U0SsAD-mXIh5rIfuDiSHSNhZcsJvg; shshshfpa=17943c91-d534-104f-a035-6e1719740bb6-1525571955; shshshfpb=2f200f7c5265e4af999b95b20d90e6618559f7251020a80ea1aee61500; cn=0; 3AB9D23F7A4B3C9B=QFOFIDQSIC7TZDQ7U4RPNYNFQN7S26SFCQQGTC3YU5UZQJZUBNPEXMX7O3R7SIRBTTJ72AXC4S3IJ46ESBLTNHD37U; ipLoc-djd=19-1607-3638-3638.608841570; __jdu=930036140; user-key=31a7628c-a9b2-44b0-8147-f10a9e597d6f; areaId=19; __jdv=122270672|direct|-|none|-|1529893590075; PCSYCityID=25; mt_xid=V2_52007VwsQU1xaVVoaSClUA2YLEAdbWk5YSk9MQAA0BBZOVQ0ADwNLGlUAZwQXVQpaAlkvShhcDHsCFU5eXENaGkIZWg5nAyJQbVhiWR9BGlUNZwoWYl1dVF0%3D; __jdc=122270672; shshshfp=72ec41b59960ea9a26956307465948f6; rkv=V0700; __jda=122270672.930036140.-.1529979524.1529984840.85; __jdb=122270672.1.930036140|85.1529984840; shshshsID=f797fbad20f4e576e9c30d1c381ecbb1_1_1529984840145',
    }
    # Same timeout as getHTMLText so a stalled connection cannot hang forever.
    r = requests.get(new_brand_url, headers=head, timeout=30)
    r.raise_for_status()
    r.encoding = 'utf-8'
    if not r.text:
        raise ValueError("the search page is empty")
    html1 = etree.HTML(r.text)
    if html1 is None:
        raise ValueError("the search page could not be parsed")

    goods_href = []
    for data in html1.xpath('//li[contains(@class,"gl-item")]'):
        p_price = data.xpath('div/div[@class="p-price"]/strong/i/text()')
        p_name = data.xpath('div/div[@class="p-name p-name-type-2"]/a/em')
        p_href = data.xpath('div/div[@class="p-name p-name-type-2"]/a/@href')
        # Items with a different layout lack one of these nodes; skipping
        # them avoids an IndexError and keeps the printed numbers aligned
        # with goods_href.
        if not (p_price and p_name and p_href):
            continue
        goods_href.append(p_href)
        print(len(goods_href), [p_name[0].xpath('string(.)'), p_price[0]])

    total = len(goods_href)
    if total == 0:
        # With nothing to choose from, the prompt loop below could never end.
        raise ValueError("no goods found on the search page")

    while True:
        focus_num = input("您关注的商品序号是:")
        # isdecimal() only accepts characters int() can parse; isdigit() also
        # accepts e.g. superscript digits, which would make int() raise.
        if not focus_num.isdecimal():
            print("您的输入有误,请输入数字:")
            continue
        a = int(focus_num)
        if a > total:
            print("输入序号过大,请重新输入:")
        elif a < 1:
            print("输入序号过小,请重新输入:")
        else:
            break

    focus_good_url = goods_href[a - 1]
    return focus_good_url



def open_Firefox(num):
    """Open a product page in a browser and optionally save it to the watch list.

    Despite its name, the function drives Chrome through Selenium. After the
    page is loaded the user is asked whether to record it; on "yes" the page
    title, the final URL and a separator row are appended to 'JD_goods.csv'.

    Args:
        num: URL of the product page to open.
    """
    # NOTE(review): the driver is never quit, so the browser window outlives
    # this call -- presumably so the user can inspect the page; confirm
    # before adding driver.quit().
    driver = webdriver.Chrome()
    driver.get(num)
    focus_url = driver.current_url
    # Drop the last 16 characters of the title -- presumably a fixed site
    # suffix appended to every product title; TODO confirm.
    focus_title = driver.title[:-16]
    YesorNo3 = input("是否将此商品加入关注列表?(yes or no)")
    # Accept 'YES' as well, matching the yes/no prompt in the main script.
    if YesorNo3.strip().lower() != 'yes':
        return
    with open('JD_goods.csv', 'a', newline="", encoding='utf-8') as f:
        csv.writer(f).writerows([
            [focus_title],
            [focus_url],
            ["---------------------------"],
        ])
    # Report success only after the rows have actually been written.
    print("商品已成功加入关注列表")


def main():
    """Run the interactive flow: search, pick brand, sort, pick item, open it."""
    # Optionally show the saved watch list first.
    while True:
        YesorNo = input("是否需要打开关注商品信息:(yes or no)")
        if YesorNo in ('yes', 'YES'):
            try:
                with open('JD_goods.csv', 'r', encoding='utf-8') as cv:
                    print(cv.read())
            except FileNotFoundError:
                # The file only exists once an item has been saved; on a
                # first run this used to crash the script.
                print("关注列表为空")
            break
        if YesorNo in ('no', 'NO'):
            break
        print("输入有误,请重新输入:")

    goods_name = input("请输入需要查询的商品种类:")
    data = getHTMLText(goods_name)

    # NOTE(review): the original code ran the brand selection for both
    # answers to this prompt, so the answer has no effect. The prompt is kept
    # to preserve the interaction; decide what "no" should do.
    input("是否需要根据商品品牌进行排列:(yes or no)")
    brand_url = searchGoods(data)

    # Fall back to the unsorted URL if orderBy reports the default order with
    # a falsy value instead of a URL.
    new_brand_url = orderBy(brand_url) or brand_url

    focus_good_url = focus_good(new_brand_url)
    # focus_good returns the list of matched hrefs; take the first one
    # directly instead of slicing the list's string representation.
    if isinstance(focus_good_url, (list, tuple)):
        href = focus_good_url[0] if focus_good_url else ''
    else:
        href = str(focus_good_url)
    if not href:
        print("未找到该商品的链接")
        return
    # Prefix the scheme only when the href does not already carry one.
    if href.startswith(('http://', 'https://')):
        new_url = href
    else:
        new_url = "https:" + href
    open_Firefox(new_url)


if __name__ == '__main__':
    main()