-
Notifications
You must be signed in to change notification settings - Fork 1
/
Spider.php
141 lines (125 loc) · 4.5 KB
/
Spider.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
<?php
namespace ins2me;
require_once dirname(__FILE__) . '/config.php';
/**
 * Fetches a page through the configured proxy, extracts the image URLs
 * embedded in it, and downloads those images into a dated directory.
 *
 * The proxy settings (PROXY_IP, PROXY_PORT, PROXY_USER, PROXY_PWD) are
 * constants expected to be defined by the config file required by this file.
 */
class Spider
{
    /** @var string[] Paths of the images handled so far (newly written or already on disk). */
    public static $imgFileNames = [];

    /** @var string Base directory images are saved under (relative to the working directory). */
    private static $path = './img/';

    /**
     * Fetch the body of $url through the proxy.
     *
     * The request is retried when it does not yield HTTP 200 with a
     * non-empty body, but only up to $maxAttempts times so that a dead URL
     * or proxy cannot cause unbounded recursion.
     *
     * @param string $url         Page URL to fetch.
     * @param int    $maxAttempts Maximum number of attempts (values below 1 are treated as 1).
     *
     * @return string|false Response body, or false when every attempt failed.
     */
    public function imgSpider($url, $maxAttempts = 3)
    {
        $maxAttempts = max(1, (int) $maxAttempts);

        for ($attempt = 1; $attempt <= $maxAttempts; $attempt++) {
            $ch = curl_init();
            $this->curlSetOpt($ch, $url);
            $result = curl_exec($ch);
            $httpCode = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
            curl_close($ch);

            if ($httpCode === 200 && $result) {
                return $result;
            }
        }

        return false;
    }

    /**
     * Extract the unique "display_url" values from the page at $url.
     *
     * NOTE(review): the matched values are returned exactly as they appear in
     * the page source; if the page embeds them as JSON-escaped strings
     * (e.g. "\/" or "\u0026") the caller may need to decode them - confirm.
     *
     * @param string $url Page URL to scan.
     *
     * @return string[]|false Unique matched URLs (original match keys preserved),
     *                        or false when the page could not be fetched or
     *                        contained no match.
     */
    public function getImgUrl($url)
    {
        $htmlContent = $this->imgSpider($url);
        if (!$htmlContent) {
            return false;
        }

        // Capture everything between display_url":" and the next double quote.
        $reg_tag = '/display_url\":\"(.*?)\"/';
        preg_match_all($reg_tag, $htmlContent, $match_result);
        unset($htmlContent);

        if (empty($match_result[1])) {
            return false;
        }

        return array_unique($match_result[1]);
    }

    /**
     * Apply the shared proxy, timeout and TLS options to a cURL handle.
     *
     * @param resource|object $ch      cURL easy handle to configure.
     * @param string          $httpUrl URL the handle should request.
     *
     * @return void
     */
    private function curlSetOpt(&$ch, $httpUrl)
    {
        curl_setopt($ch, CURLOPT_PROXY, PROXY_IP); // proxy server address
        curl_setopt($ch, CURLOPT_PROXYPORT, PROXY_PORT); // proxy server port
        curl_setopt($ch, CURLOPT_PROXYUSERPWD, PROXY_USER . ':' . PROXY_PWD); // proxy credentials
        // 7 is libcurl's CURLPROXY_SOCKS5_HOSTNAME (SOCKS5 with proxy-side DNS
        // resolution), not a plain HTTP proxy.
        curl_setopt($ch, CURLOPT_PROXYTYPE, 7);
        curl_setopt($ch, CURLOPT_URL, $httpUrl);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 30);
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 0);
        // NOTE(review): peer certificate verification is disabled, which allows
        // man-in-the-middle interception. Left unchanged because enabling it
        // may need a CA bundle in the deployment environment.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, 0);
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);
    }

    /**
     * Download all given image URLs concurrently and store them via saveAsImage().
     *
     * Only responses with HTTP status 200 and a non-empty body are saved, so
     * failed transfers do not end up on disk as broken image files.
     *
     * @param string[]|\Traversable|false $imgUrls URLs to download; anything
     *        that is not iterable (e.g. the false returned by getImgUrl()) is ignored.
     *
     * @return void
     */
    public function downloadImage($imgUrls)
    {
        if (!is_array($imgUrls) && !($imgUrls instanceof \Traversable)) {
            return;
        }

        $mh = curl_multi_init();
        $transfers = [];

        // Register one easy handle per URL on the multi handle.
        foreach ($imgUrls as $imgUrl) {
            $ch = curl_init();
            $this->curlSetOpt($ch, $imgUrl);
            curl_multi_add_handle($mh, $ch);
            $transfers[] = ['url' => $imgUrl, 'handle' => $ch];
        }

        // Drive the transfers until none is active or an error is reported.
        $active = null;
        do {
            do {
                $mrc = curl_multi_exec($mh, $active);
            } while ($mrc == CURLM_CALL_MULTI_PERFORM);

            // Block until there is socket activity. When select reports -1,
            // sleep 50 ms instead of spinning, then run curl_multi_exec again
            // so the transfers can still make progress.
            if ($active && $mrc == CURLM_OK && curl_multi_select($mh) == -1) {
                usleep(50000);
            }
        } while ($active && $mrc == CURLM_OK);

        // Collect the results and release every easy handle.
        $downloads = [];
        foreach ($transfers as $transfer) {
            $ch = $transfer['handle'];
            $data = curl_multi_getcontent($ch);
            $httpCode = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
            curl_multi_remove_handle($mh, $ch);
            curl_close($ch);

            if ($httpCode === 200 && is_string($data) && $data !== '') {
                $downloads[] = ['url' => $transfer['url'], 'data' => $data];
            }
        }
        curl_multi_close($mh);

        foreach ($downloads as $download) {
            $this->saveAsImage($download['url'], $download['data']);
        }
    }

    /**
     * Write image data to <base path>/<Y/m/d>/<file name taken from the URL>.
     *
     * The file name is the last path segment of $url with any query string
     * removed. An existing file is not overwritten, but its path is still
     * recorded in self::$imgFileNames. Nothing is recorded when the directory
     * cannot be created, the URL yields no file name, or the write fails.
     *
     * @param string $url  Source URL of the image (used only to derive the file name).
     * @param string $file Raw image data to write.
     *
     * @return void
     */
    public function saveAsImage($url, $file)
    {
        $dir = self::$path . date('Y/m/d');
        if (!is_dir($dir)) {
            // Re-check is_dir() in case another process created it concurrently.
            if (!mkdir($dir, 0777, true) && !is_dir($dir)) {
                return;
            }
            chmod($dir, 0777);
        }

        // Cut the query string off first so that a URL without "?" keeps its
        // name and a "/" inside the query cannot be mistaken for a path separator.
        $urlWithoutQuery = explode('?', (string) $url, 2)[0];
        $filename = basename($urlWithoutQuery);
        if ($filename === '') {
            return;
        }

        $target = $dir . '/' . $filename;
        if (file_exists($target) || file_put_contents($target, $file) !== false) {
            // Record the image as part of this run.
            self::$imgFileNames[] = $target;
        }
    }
}