netspider_jiayuan.py
234 lines (223 loc) · 9 KB
#version 0.1
#get code on https://github.com/bluedazzle/python_network
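# Overview (summary of what this script does): it logs in to jiayuan.com with a
# hard-coded test account, then walks sequential numeric user IDs starting at
# 100000000, fetches each profile page, extracts fields (name, ID, sex, age,
# constellation, location, height, education, job, marital status, nation,
# salary, self-introduction) with regular expressions, downloads the profile
# photo when one exists, appends one CSV row per user, and writes a run log.
# The regexes are tied to the page layout at the time of writing and may need
# updating if the site's HTML changes.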
import urllib.request
import urllib.error   # URLError is handled in requestData
import urllib.parse
import http.cookiejar
import time
import re
import os
# Shared module-level state
isLogin = False          # set True once the RAW_HASH login cookie is seen
isNetCon = True          # cleared when a request fails with URLError
isPhotoExist = False     # whether the current user has a profile photo
userurl = 100000000      # numeric user ID of the next profile to crawl
gettimes = 0             # number of profiles attempted so far
failtimes = 0            # number of profiles that failed to parse
photo_url = ''
photooutpath = ''
failArr = ['Failure list:']
loglists = ['']
def createLog(loglist):
    '''Append the run summary and the failure list to jiayuan_spider.log.'''
    global failArr
    try:
        # the with statement closes the file, so no explicit close() is needed
        with open('jiayuan_spider.log', 'a+') as file1:
            file1.write(str(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))))
            for it in loglist:
                file1.write(it)
                file1.write('\n')
            for it in failArr:
                file1.write(it)
                file1.write('\n')
            file1.write('\n\n\n')
        print('Log saved successfully...')
    except:
        print('Failed to save the log')
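# loginJiaYuan performs a two-step login: a plain GET to dologin.php to pick up
# session cookies, then a POST of the (hard-coded test) credentials using the
# same cookie jar; a RAW_HASH cookie appearing in the jar afterwards is taken
# as proof that the login succeeded.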
def loginJiaYuan():
    global isLogin
    global loglists
    post_headers = {'Referer': 'http://login.jiayuan.com/err.php?err_type=2&pre_url=http://www.jiayuan.com/usercp',
                    'Host': 'login.jiayuan.com',
                    'Accept': 'text/html, application/xhtml+xml, */*',
                    'Content-Type': 'application/x-www-form-urlencoded',
                    'Cookie': 'registeruid=104463413; main_search:104463413=%7C%7C%7C00; REG_REF_URL=; poplogincount=1',
                    'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)'}
    print('Logging in to jiayuan.com...')
    try:
        cj = http.cookiejar.CookieJar()
        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
        urllib.request.install_opener(opener)
        # [step 1] plain GET to obtain the session cookies
        MainUrl = "http://login.jiayuan.com/dologin.php?pre_url=http://www.jiayuan.com/usercp"
        resp = urllib.request.urlopen(MainUrl)
        # [step 2] POST the login form to emulate a browser login
        MainLoginUrl = "http://login.jiayuan.com/dologin.php?pre_url=http://www.jiayuan.com/usercp"
        postDict = {
            'channel': '0',
            'name': 'p2p_test@yeah.net',
            'password': '123456qq',
            'ijg_login': '1',
            'position': '0'}
        postData = urllib.parse.urlencode(postDict).encode()
        req = urllib.request.Request(MainLoginUrl, postData)
        for key, itm in post_headers.items():
            req.add_header(key, itm)
        html = urllib.request.urlopen(req)
        # the RAW_HASH cookie is only set after a successful login
        for cookie in cj:
            if cookie.name == 'RAW_HASH':
                isLogin = True
        time.sleep(0.2)
    except:
        print('Connection failed, please check your network connection')
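# requestData is the single network helper: on URLError it clears isNetCon and
# returns an empty string, which the main loop uses to stop the crawl cleanly.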
def requestData(url):
    '''Request-data module: fetch one profile page and return its decoded HTML.'''
    global isNetCon
    global loglists
    try:
        data = ''
        req = urllib.request.Request(url)
        req.add_header('Content-Type', 'application/x-www-form-urlencoded')
        reqs = urllib.request.urlopen(req)
        data = reqs.read().decode('utf-8')
    except urllib.error.URLError as err:
        print('Network is not connected!')
        loglists.append('Network is not connected!')
        isNetCon = False
        return data
    print('Received data from the server')
    return data
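# formatUserData pulls each field out of the raw HTML with re.findall and
# positional indexing ([0], [1], ...), so the order of the <span> blocks on the
# profile page is assumed to be stable; any layout change raises IndexError and
# the profile URL is recorded in failArr instead.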
def formatUserData(data):
    '''User-data extraction module; returns an empty list when extraction fails.'''
    global userurl
    global gettimes
    global failtimes
    global failArr
    global photo_url
    global photooutpath
    global isPhotoExist
    res = []
    try:
        # user photo
        photo_urlarr = re.findall('''<img\s*?src=["](\S*?)["]/></a>''', data)
        if photo_urlarr:
            print('Found a photo!')
            isPhotoExist = True
            try:
                os.mkdir('userphoto')
            except FileExistsError:
                pass
            finally:
                photooutpath = os.path.join('userphoto', str(userurl) + '.jpg')
                photo_url = photo_urlarr[0]
                urllib.request.urlretrieve(photo_url, photooutpath)
        else:
            print('This user has no photo')
        # user name and user ID
        res = re.findall('''<title>[\u4e00-\u9fa5]+[_]{1}([\u4e00-\u9fa5]+)''', data)
        res.append(re.findall('''<title>[\u4e00-\u9fa5]+[_]{1}[\u4e00-\u9fa5]+\S*?(\d{9})''', data)[0])
        # sex
        res.append(re.findall('''查看详细信息>\S*?</a>([\u4e00-\u9fa5]+)''', data)[0])
        # age
        res.append(re.findall('''查看详细信息>\S*?</a>[\u4e00-\u9fa5]+\S*?(\d{1,3}[\u4e00-\u9fa5]+)''', data)[0])
        # constellation
        res.append(re.findall('''查看详细信息>\S*?</a>[\u4e00-\u9fa5]+\S*?\d{1,3}[\u4e00-\u9fa5]+\S*?([\u4e00-\u9fa5]+)''', data)[0])
        # hometown
        res.append(re.findall('''查看详细信息>\S*?</a>\S*?([\u4e00-\u9fa5]+)</h2>''', data)[0])
        # height
        res.append(re.findall('''<span class *= *['"]*\S+["']><b>([\u4e00-\u9fa5]+)[:]{1}</b>(\d{1,3}[\u4e00-\u9fa5]+)''', data)[0])
        # education background and job
        res.append(re.findall('''<span class *= *['"]*\S+160["']><b>([\u4e00-\u9fa5]+)[:]{1}</b>(\S*?)</span>''', data)[0])
        res.append(re.findall('''<span class *= *['"]*\S+160["']><b>([\u4e00-\u9fa5]+)[:]{1}</b>(\S*?)</span>''', data)[1])
        # marital status and nation
        res.append(re.findall('''<span class *= *['"]*\S+100["']><b>([\u4e00-\u9fa5]+)[:]{1}</b>([\u4e00-\u9fa5]+)''', data)[0])
        res.append(re.findall('''<span class *= *['"]*\S+100["']><b>([\u4e00-\u9fa5]+)[:]{1}</b>([\u4e00-\u9fa5]+)''', data)[1])
        res.append(re.findall('''<span class *= *['"]*\S+100["']><b>([\u4e00-\u9fa5]+)[:]{1}</b>([\u4e00-\u9fa5]+)''', data)[2])
        # salary
        res.append(re.findall('''<span class *= *['"]*\S+140["']><b>([\u4e00-\u9fa5]+)[:]{1}</b>(\S*?)</span>''', data)[0])
        res.append(re.findall('''<span class *= *['"]*\S+140["']><b>([\u4e00-\u9fa5]+)[:]{1}</b>(\S*?)</span>''', data)[1])
        # self-introduction
        res.append(re.findall('''class=["]internal_monolog_content["]>\s*?<p>(.*?)</p>''', data)[0])
    except:
        print('Data extraction failed!')
        print('============================================')
        failArr.append('http://www.jiayuan.com/' + str(userurl))
        failtimes = failtimes + 1
        res.clear()
    finally:
        userurl = userurl + 1
        gettimes = gettimes + 1
    return res
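# outputData appends one comma-separated row per user: sequence number, profile
# URL, photo path (or a placeholder), then the extracted fields; tuple entries
# from the two-group regexes are flattened into the row.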
def outputData(url, result, outfilename='userdata'):
    '''Data output module: append one CSV row for the current user.'''
    outfile = outfilename + '.csv'
    try:
        # utf-8 keeps the Chinese field values writable regardless of the OS locale
        with open(outfile, 'a+', encoding='utf-8') as file0:
            file0.write(str(gettimes))
            file0.write(',')
            file0.write(url)
            file0.write(',')
            if isPhotoExist:
                file0.write(photooutpath)
            else:
                file0.write('no photo')
            file0.write(',')
            for itm in result:
                if isinstance(itm, tuple):
                    # two-group regex matches arrive as (label, value) tuples
                    for item in itm:
                        file0.write(item)
                else:
                    file0.write(itm)
                file0.write(',')
            file0.write('\n')
            print('Data written successfully!')
            print('============================================')
    except UnicodeEncodeError as err:
        print('Page encoding changed, data write failed')
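# Driver: ask how many profiles to crawl, log in, then walk user IDs one at a
# time, pausing 0.2 s between requests and stopping early if the network drops.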
dataNum = int(input('Enter the number of profiles to crawl: '))
outputFileName = input('Enter the output file name (default: userData): ')
loginJiaYuan()
if isLogin:
    print('Login succeeded!')
    loglists.append('Login succeeded!')
    if not outputFileName:
        outputFileName = 'userData'
    outtmp = 'Output file: ' + str(outputFileName) + '.csv'
    loglists.append(outtmp)
    while gettimes < dataNum:
        url = 'http://www.jiayuan.com/' + str(userurl)
        print('Query', gettimes + 1, ':')
        htmdata = requestData(url)
        restmp = formatUserData(htmdata)
        if restmp:
            outputData(url, restmp, outputFileName)
        if not isNetCon:
            break
        isPhotoExist = False
        time.sleep(0.2)
else:
    print('Login failed!')
    loglists.append('Login failed!')
print('===========================================')
print('===========================================')
if isNetCon:
    tmp = 'Crawl finished: ' + str(gettimes) + ' attempts, ' + str(failtimes) + ' failed'
    print(tmp)
    loglists.append(tmp)
else:
    print('Crawl failed, please check your network connection.')
    loglists.append('Crawl failed, please check your network connection.')
for item in failArr:
    print(item)
createLog(loglists)
tm = input('Press Enter to exit')