jdspider Posted on 2019-03-07 | In Python爬虫 先占个位置123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172import requestsfrom lxml import etreeimport csvfrom bs4 import BeautifulSoupfrom selenium import webdriverimport redef getHTMLText(goods): url = 'https://search.jd.com/Search?keyword='+ goods+ '&enc=utf-8' head={'authority': 'search.jd.com', 'method': 'GET', 'path': '/s_new.php?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&page=4&s=84&scrolling=y&log_id=1529828108.22071&tpl=3_M&show_items=7651927,7367120,7056868,7419252,6001239,5934182,4554969,3893501,7421462,6577495,26480543553,7345757,4483120,6176077,6932795,7336429,5963066,5283387,25722468892,7425622,4768461', 'scheme': 'https', 'referer': 'https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&page=3&s=58&click=0', 'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36', 'x-requested-with': 'XMLHttpRequest', 'Cookie':'qrsc=3; pinId=RAGa4xMoVrs; xtest=1210.cf6b6759; ipLocation=%u5E7F%u4E1C; _jrda=5; TrackID=1aUdbc9HHS2MdEzabuYEyED1iDJaLWwBAfGBfyIHJZCLWKfWaB_KHKIMX9Vj9_2wUakxuSLAO9AFtB2U0SsAD-mXIh5rIfuDiSHSNhZcsJvg; shshshfpa=17943c91-d534-104f-a035-6e1719740bb6-1525571955; shshshfpb=2f200f7c5265e4af999b95b20d90e6618559f7251020a80ea1aee61500; cn=0; 3AB9D23F7A4B3C9B=QFOFIDQSIC7TZDQ7U4RPNYNFQN7S26SFCQQGTC3YU5UZQJZUBNPEXMX7O3R7SIRBTTJ72AXC4S3IJ46ESBLTNHD37U; ipLoc-djd=19-1607-3638-3638.608841570; __jdu=930036140; user-key=31a7628c-a9b2-44b0-8147-f10a9e597d6f; areaId=19; __jdv=122270672|direct|-|none|-|1529893590075; PCSYCityID=25; mt_xid=V2_52007VwsQU1xaVVoaSClUA2YLEAdbWk5YSk9MQAA0BBZOVQ0ADwNLGlUAZwQXVQpaAlkvShhcDHsCFU5eXENaGkIZWg5nAyJQbVhiWR9BGlUNZwoWYl1dVF0%3D; __jdc=122270672; shshshfp=72ec41b59960ea9a26956307465948f6; rkv=V0700; __jda=122270672.930036140.-.1529979524.1529984840.85; __jdb=122270672.1.930036140|85.1529984840; shshshsID=f797fbad20f4e576e9c30d1c381ecbb1_1_1529984840145' } try: r =requests.get(url,headers = head ,timeout = 30) r.raise_for_status() r.encoding = r.apparent_encoding return r.text except: return ""def searchGoods(brand): soup = BeautifulSoup(brand,'html.parser') data1 = soup.find('ul',{"class":"J_valueList v-fixed"}) datali =data1.find_all('li') Goods_href=[] Goods_name=[] for li in datali: Goods_name.append(li.a.attrs['title']) print(li.a.attrs['title']) Goods_href.append(li.a.attrs['href']) count = 0 for j in range(len(Goods_href)): print("<<<{}.".format(count+1),"品牌 :"+Goods_name[j]) count = count+1 judge = 1 while(judge): Goods_num = input("请输入品牌对应序号:") if Goods_num.isdigit(): judge = 0 else: print("您的输入有误,请输入数字:") continue a = int(Goods_num) if a>count: print("输入序号过大,请重新输入:") judge = 1 elif a<1: print("输入序号过小,请重新输入:") judge = 1 print("选择的品牌是: "+Goods_name[int(Goods_num)-1]) brand_url = "https://search.jd.com/"+Goods_href[int(Goods_num)-1] return brand_urldef orderBy(brand_url): judge = 1 while(judge): kind = input("按照:综合 / 销量 / 评论数 / 新品 / 价格 进行排序(默认综合)") strinfo =re.compile('uc=0#J_searchWrap')#在对网页的url进行分析的时候发现 uc=0#J_searchWrap可以删减,如果点击不同的话对应的知识psort的值不同 if kind == '综合': judge = 0 return 0 if kind == '销量': b = strinfo.sub('psort=3',brand_url) judge = 0 elif kind =='评论数': b = strinfo.sub('psort=4',brand_url) judge = 0 elif kind =='新品': b = strinfo.sub('psort=5',brand_url) judge = 0 elif kind =='价格': b = strinfo.sub('psort=2',brand_url) judge = 0 else : print("输入有误,请重新输入:") return bdef focus_good(new_brand_url): head={'authority': 'search.jd.com', 'method': 'GET', 'path': '/s_new.php?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&page=4&s=84&scrolling=y&log_id=1529828108.22071&tpl=3_M&show_items=7651927,7367120,7056868,7419252,6001239,5934182,4554969,3893501,7421462,6577495,26480543553,7345757,4483120,6176077,6932795,7336429,5963066,5283387,25722468892,7425622,4768461', 'scheme': 'https', 'referer': 'https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&page=3&s=58&click=0', 'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36', 'x-requested-with': 'XMLHttpRequest', 'Cookie':'qrsc=3; pinId=RAGa4xMoVrs; xtest=1210.cf6b6759; ipLocation=%u5E7F%u4E1C; _jrda=5; TrackID=1aUdbc9HHS2MdEzabuYEyED1iDJaLWwBAfGBfyIHJZCLWKfWaB_KHKIMX9Vj9_2wUakxuSLAO9AFtB2U0SsAD-mXIh5rIfuDiSHSNhZcsJvg; shshshfpa=17943c91-d534-104f-a035-6e1719740bb6-1525571955; shshshfpb=2f200f7c5265e4af999b95b20d90e6618559f7251020a80ea1aee61500; cn=0; 3AB9D23F7A4B3C9B=QFOFIDQSIC7TZDQ7U4RPNYNFQN7S26SFCQQGTC3YU5UZQJZUBNPEXMX7O3R7SIRBTTJ72AXC4S3IJ46ESBLTNHD37U; ipLoc-djd=19-1607-3638-3638.608841570; __jdu=930036140; user-key=31a7628c-a9b2-44b0-8147-f10a9e597d6f; areaId=19; __jdv=122270672|direct|-|none|-|1529893590075; PCSYCityID=25; mt_xid=V2_52007VwsQU1xaVVoaSClUA2YLEAdbWk5YSk9MQAA0BBZOVQ0ADwNLGlUAZwQXVQpaAlkvShhcDHsCFU5eXENaGkIZWg5nAyJQbVhiWR9BGlUNZwoWYl1dVF0%3D; __jdc=122270672; shshshfp=72ec41b59960ea9a26956307465948f6; rkv=V0700; __jda=122270672.930036140.-.1529979524.1529984840.85; __jdb=122270672.1.930036140|85.1529984840; shshshsID=f797fbad20f4e576e9c30d1c381ecbb1_1_1529984840145' } r = requests.get(new_brand_url,headers = head) r.encoding = 'utf-8' html1 = etree.HTML(r.text) datas = html1.xpath('//li[contains(@class,"gl-item")]') count = 1 goods_href =[] for data in datas: p_price = data.xpath('div/div[@class="p-price"]/strong/i/text()') # p_comment = data.xpath('div/div[5]/strong/a/text()') p_name = data.xpath('div/div[@class="p-name p-name-type-2"]/a/em') p_href = data.xpath('div/div[@class="p-name p-name-type-2"]/a/@href') print(count,[p_name[0].xpath('string(.)'),p_price[0]]) goods_href.append(p_href) count = count+1 judge = 1 while(judge): focus_num = input("您关注的商品序号是:") if focus_num.isdigit(): judge = 0 else: print("您的输入有误,请输入数字:") continue a = int(focus_num) if a>count-1: print("输入序号过大,请重新输入:") judge = 1 elif a<1: print("输入序号过小,请重新输入:") judge = 1 focus_good_url = goods_href[int(focus_num)-1] # print(focus_good_url) return focus_good_urldef open_Firefox(num): #location = 'D:/firefox-48.0b9.win64.sdk/firefox-sdk/bin/firefox.exe' driver = webdriver.Chrome() driver.get(num) focus_url = driver.current_url focus_title = driver.title[:-16] YesorNo3 = input("是否将此商品加入关注列表?(yes or no)") if YesorNo3 == 'yes': print("商品已成功加入关注列表") with open('JD_goods.csv', 'a', newline="", encoding='utf-8') as f: write1 = csv.writer(f) write1.writerow([focus_title]) write1.writerow([focus_url]) write1.writerow(["---------------------------"])if __name__=='__main__': judge = 1 while(judge): YesorNo = input("是否需要打开关注商品信息:(yes or no)") if YesorNo == 'yes' or YesorNo == 'YES': with open('JD_goods.csv','r',encoding='utf-8') as cv: cv_read = cv.read() print(cv_read) judge = 0 elif YesorNo == 'no' or YesorNo == 'NO': judge = 0 else: print("输入有误,请重新输入:") goods_name =input("请输入需要查询的商品种类:") data = getHTMLText(goods_name) YesorNo2 = input("是否需要根据商品品牌进行排列:(yes or no)") if YesorNo2 == 'yes': brand_url = searchGoods(data) else : brand_url = searchGoods(data) new_brand_url = orderBy(brand_url) focus_good_url = focus_good(new_brand_url) str1 = str(focus_good_url) new_url = "https:"+str1[2:-2] # print(new_url) open_Firefox(new_url)