Francis' Blog

Keep going! Fight on! Aoligei!



jdspider

Posted on 2019-03-07 | In Python Crawlers

Just a placeholder for now.

import requests
from lxml import etree
import csv
from bs4 import BeautifulSoup
from selenium import webdriver
import re

# Request headers copied from a real browser session; shared by both requests
# below. The cookie values are specific to the session captured at the time.
HEAD = {'authority': 'search.jd.com',
        'method': 'GET',
        'path': '/s_new.php?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&page=4&s=84&scrolling=y&log_id=1529828108.22071&tpl=3_M&show_items=7651927,7367120,7056868,7419252,6001239,5934182,4554969,3893501,7421462,6577495,26480543553,7345757,4483120,6176077,6932795,7336429,5963066,5283387,25722468892,7425622,4768461',
        'scheme': 'https',
        'referer': 'https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&page=3&s=58&click=0',
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
        'x-requested-with': 'XMLHttpRequest',
        'Cookie': 'qrsc=3; pinId=RAGa4xMoVrs; xtest=1210.cf6b6759; ipLocation=%u5E7F%u4E1C; _jrda=5; TrackID=1aUdbc9HHS2MdEzabuYEyED1iDJaLWwBAfGBfyIHJZCLWKfWaB_KHKIMX9Vj9_2wUakxuSLAO9AFtB2U0SsAD-mXIh5rIfuDiSHSNhZcsJvg; shshshfpa=17943c91-d534-104f-a035-6e1719740bb6-1525571955; shshshfpb=2f200f7c5265e4af999b95b20d90e6618559f7251020a80ea1aee61500; cn=0; 3AB9D23F7A4B3C9B=QFOFIDQSIC7TZDQ7U4RPNYNFQN7S26SFCQQGTC3YU5UZQJZUBNPEXMX7O3R7SIRBTTJ72AXC4S3IJ46ESBLTNHD37U; ipLoc-djd=19-1607-3638-3638.608841570; __jdu=930036140; user-key=31a7628c-a9b2-44b0-8147-f10a9e597d6f; areaId=19; __jdv=122270672|direct|-|none|-|1529893590075; PCSYCityID=25; mt_xid=V2_52007VwsQU1xaVVoaSClUA2YLEAdbWk5YSk9MQAA0BBZOVQ0ADwNLGlUAZwQXVQpaAlkvShhcDHsCFU5eXENaGkIZWg5nAyJQbVhiWR9BGlUNZwoWYl1dVF0%3D; __jdc=122270672; shshshfp=72ec41b59960ea9a26956307465948f6; rkv=V0700; __jda=122270672.930036140.-.1529979524.1529984840.85; __jdb=122270672.1.930036140|85.1529984840; shshshsID=f797fbad20f4e576e9c30d1c381ecbb1_1_1529984840145'
        }


def getHTMLText(goods):
    # Fetch the JD search results page for the given keyword.
    url = 'https://search.jd.com/Search?keyword=' + goods + '&enc=utf-8'
    try:
        r = requests.get(url, headers=HEAD, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return ""


def searchGoods(brand):
    # List the brands found on the search page and let the user pick one.
    soup = BeautifulSoup(brand, 'html.parser')
    data1 = soup.find('ul', {"class": "J_valueList v-fixed"})
    datali = data1.find_all('li')
    Goods_href = []
    Goods_name = []
    for li in datali:
        Goods_name.append(li.a.attrs['title'])
        print(li.a.attrs['title'])
        Goods_href.append(li.a.attrs['href'])
    count = 0
    for j in range(len(Goods_href)):
        print("<<<{}.".format(count + 1), "品牌 :" + Goods_name[j])
        count = count + 1
    judge = 1
    while judge:
        Goods_num = input("请输入品牌对应序号:")
        if Goods_num.isdigit():
            judge = 0
        else:
            print("您的输入有误,请输入数字:")
            continue
        a = int(Goods_num)
        if a > count:
            print("输入序号过大,请重新输入:")
            judge = 1
        elif a < 1:
            print("输入序号过小,请重新输入:")
            judge = 1
    print("选择的品牌是: " + Goods_name[int(Goods_num) - 1])
    brand_url = "https://search.jd.com/" + Goods_href[int(Goods_num) - 1]
    return brand_url


def orderBy(brand_url):
    # Analyzing the page URL shows that "uc=0#J_searchWrap" can be dropped;
    # choosing a different sort order only changes the value of psort.
    strinfo = re.compile('uc=0#J_searchWrap')
    judge = 1
    while judge:
        kind = input("按照:综合 / 销量 / 评论数 / 新品 / 价格 进行排序(默认综合)")
        if kind == '综合':
            return brand_url  # default (comprehensive) order: keep the URL unchanged
        if kind == '销量':
            b = strinfo.sub('psort=3', brand_url)
            judge = 0
        elif kind == '评论数':
            b = strinfo.sub('psort=4', brand_url)
            judge = 0
        elif kind == '新品':
            b = strinfo.sub('psort=5', brand_url)
            judge = 0
        elif kind == '价格':
            b = strinfo.sub('psort=2', brand_url)
            judge = 0
        else:
            print("输入有误,请重新输入:")
    return b


def focus_good(new_brand_url):
    # Print the name and price of every item on the (sorted) brand page,
    # then let the user pick the item to follow.
    r = requests.get(new_brand_url, headers=HEAD)
    r.encoding = 'utf-8'
    html1 = etree.HTML(r.text)
    datas = html1.xpath('//li[contains(@class,"gl-item")]')
    count = 1
    goods_href = []
    for data in datas:
        p_price = data.xpath('div/div[@class="p-price"]/strong/i/text()')
        # p_comment = data.xpath('div/div[5]/strong/a/text()')
        p_name = data.xpath('div/div[@class="p-name p-name-type-2"]/a/em')
        p_href = data.xpath('div/div[@class="p-name p-name-type-2"]/a/@href')
        print(count, [p_name[0].xpath('string(.)'), p_price[0]])
        goods_href.append(p_href)
        count = count + 1
    judge = 1
    while judge:
        focus_num = input("您关注的商品序号是:")
        if focus_num.isdigit():
            judge = 0
        else:
            print("您的输入有误,请输入数字:")
            continue
        a = int(focus_num)
        if a > count - 1:
            print("输入序号过大,请重新输入:")
            judge = 1
        elif a < 1:
            print("输入序号过小,请重新输入:")
            judge = 1
    focus_good_url = goods_href[int(focus_num) - 1]
    # print(focus_good_url)
    return focus_good_url


def open_Firefox(num):
    # Despite the name, this now drives Chrome; a local Firefox binary was
    # used in an earlier version (see the commented-out path).
    # location = 'D:/firefox-48.0b9.win64.sdk/firefox-sdk/bin/firefox.exe'
    driver = webdriver.Chrome()
    driver.get(num)
    focus_url = driver.current_url
    focus_title = driver.title[:-16]  # strip the fixed JD suffix from the page title
    YesorNo3 = input("是否将此商品加入关注列表?(yes or no)")
    if YesorNo3 == 'yes':
        print("商品已成功加入关注列表")
        with open('JD_goods.csv', 'a', newline="", encoding='utf-8') as f:
            write1 = csv.writer(f)
            write1.writerow([focus_title])
            write1.writerow([focus_url])
            write1.writerow(["---------------------------"])


if __name__ == '__main__':
    judge = 1
    while judge:
        YesorNo = input("是否需要打开关注商品信息:(yes or no)")
        if YesorNo == 'yes' or YesorNo == 'YES':
            with open('JD_goods.csv', 'r', encoding='utf-8') as cv:
                cv_read = cv.read()
                print(cv_read)
            judge = 0
        elif YesorNo == 'no' or YesorNo == 'NO':
            judge = 0
        else:
            print("输入有误,请重新输入:")
    goods_name = input("请输入需要查询的商品种类:")
    data = getHTMLText(goods_name)
    YesorNo2 = input("是否需要根据商品品牌进行排列:(yes or no)")
    brand_url = searchGoods(data)  # brand selection currently happens either way
    new_brand_url = orderBy(brand_url)
    focus_good_url = focus_good(new_brand_url)
    # p_href is a one-element list like ['//item.jd.com/xxxx.html'];
    # str() wraps it as "['//...']", so slice off the bracket/quote characters.
    str1 = str(focus_good_url)
    new_url = "https:" + str1[2:-2]
    # print(new_url)
    open_Firefox(new_url)

Several Kinds of Cross-Entropy in TensorFlow

Posted on 2019-03-07 | In TensorFlow


The Logit Function

In linear regression, we fit the data with a straight line, minimizing the prediction error in the least-squares sense.

Logistic regression, by contrast, can be viewed as fitting the logit function with a straight line: the parameters are estimated by maximum likelihood, i.e., chosen so that they generate the observed samples with the highest probability.

The logit function maps a probability p in [0, 1] to the real line [-inf, inf]: logit(p) = log(p / (1 - p)). If p = 0.5 the value is 0; for p < 0.5 it is negative; for p > 0.5 it is positive.

Conversely, softmax and sigmoid both map [-inf, inf] back into [0, 1].

In TensorFlow, "logits" in a function name means that the function applies softmax or sigmoid normalization to the logit values internally. It also implies that you should not apply sigmoid or softmax to the network output yourself; those steps are computed more efficiently (and more stably) inside the function.
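
As a minimal sketch (assuming TensorFlow 1.x, current when this post was written), the fused op tf.nn.softmax_cross_entropy_with_logits can be compared against a hand-rolled softmax-plus-cross-entropy; both give the same per-sample losses, but the fused version is computed in one numerically stable step:

import tensorflow as tf  # assumes TensorFlow 1.x

# Raw network outputs (logits) for 2 samples and 3 classes.
logits = tf.constant([[2.0, 1.0, 0.1],
                      [0.5, 2.5, 0.3]])
labels = tf.constant([[1.0, 0.0, 0.0],
                      [0.0, 1.0, 0.0]])

# Fused op: softmax + cross-entropy internally.
# Note: pass the raw logits, NOT softmax(logits).
fused = tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=logits)

# Manual version: apply softmax yourself, then compute -sum(y * log(p)).
probs = tf.nn.softmax(logits)
manual = -tf.reduce_sum(labels * tf.log(probs), axis=1)

with tf.Session() as sess:
    print(sess.run(fused))   # approximately [0.417, 0.220]
    print(sess.run(manual))  # same values, up to floating-point error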

Read more »

Scrapy Learning (Part 2): The Command Line

Posted on 2019-03-05 | In Scrapy

Introduction

Scrapy is controlled through the scrapy command-line tool, which handles creating new projects, launching spiders, changing settings, and more. Scrapy's built-in commands come in two kinds: global commands and project commands. As the names suggest, a global command can be run from anywhere, while a project command can only be run inside the project directory after you have created a project. This post briefly introduces some of the most commonly used commands.

Global Commands

  • startproject

    Syntax:

scrapy startproject <project_name>

This is one of the most commonly used scrapy commands. It creates a new project named <project_name> in the current directory.
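
For reference, the generated scaffold looks roughly like this (tutorial is a placeholder project name; the exact files vary slightly with the Scrapy version):

tutorial/
    scrapy.cfg            # deploy configuration file
    tutorial/             # the project's Python module
        __init__.py
        items.py          # item definitions
        middlewares.py    # spider and downloader middlewares
        pipelines.py      # item pipelines
        settings.py       # project settings
        spiders/          # directory where your spiders live
            __init__.py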

  • settings

    Syntax:

scrapy settings [options]

This command prints Scrapy's default settings; if you run it inside a project, it prints the project's setting values instead.

  • runspider

    Syntax:

scrapy runspider <spider_file.py>

Runs a spider contained in a single Python file, without having to create a project.

  • shell

    Syntax:

scrapy shell [url]

Starts the Scrapy shell for the given URL (if one is provided), or empty if no URL is given.

For example,

scrapy shell http://www.baidu.com

opens the Baidu URL and starts an interactive shell, which you can use for some quick testing.

  • fetch

    Syntax:

scrapy fetch <url>

Downloads the given URL with the Scrapy downloader and writes the fetched content to standard output. Put simply, it prints the URL's HTML.

  • view

    Syntax:

scrapy view <url>

Opens the given URL in your default browser, showing the page as the Scrapy spider would receive it. The page a spider gets sometimes differs from what a normal user sees (dynamically loaded content is missing, for instance), so this command is useful for checking what the spider actually fetches.

  • version

    Syntax:

scrapy version [-v]

Prints the Scrapy version. When run with -v, it also prints Python, Twisted, and platform information.

Project Commands

  • crawl

    Syntax:

scrapy crawl <spider_name>

Crawls with a spider from your project, i.e., starts your project running. You will use this command constantly; it comes up again and again in later posts.
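
As a quick usage sketch (myspider is a placeholder spider name), crawl can also export the scraped items directly to a feed file with -o:

scrapy crawl myspider -o items.json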

  • check

    Syntax:

scrapy check [-l] <spider>

Runs contract checks to find errors in your project's spiders.

  • list

    Syntax:

scrapy list

Lists all available spiders in the current project, one spider per line.

  • genspider

    Syntax:

scrapy genspider [-t template] <name> <domain>

Creates a new spider in the current project. It can generate the spider from a predefined template, though you can also write the spider's source file yourself.
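
A small usage sketch (myspider and example.com are placeholder values): -l lists the built-in templates, and -t selects one when generating:

scrapy genspider -l          # lists templates: basic, crawl, csvfeed, xmlfeed
scrapy genspider -t crawl myspider example.com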
