僅用于交流和學習,禁止利用本資源從事任何違反本國(地區)法律法規的活動,一切遵守《網絡安全法》
Tips:只是提供一個思路,實際項目中還需維護代理池可用性等細節部分
實戰步驟
- 框架及核心庫部署
- 定時更新代理池進程
- 定時爬取列表頁進程
- 主進程定時從Redis中讀取列表頁任務,有則將每一項丟給異步任務執行
環境
- CentOS 7.2
- PHP7.2
- Swoole 4.3.5
- Google Chrome 78.0.3904.108
- ChromeDriver 78.0.3904.105
Composer
- facebook/webdriver=1.7
- easyswoole/easyswoole=3.1.18
- easyswoole/curl=1.0.1
框架及核心庫部署
1、安裝EasySwoole 3.1.18版本
[root@ar414.com phpseleniumdemo] composer require easyswoole/easyswoole=3.1.18
[root@ar414.com phpseleniumdemo] php vendor/easyswoole/easyswoole/bin/easyswoole install
______ _____ _
| ____| / ____| | |
| |__ __ _ ___ _ _ | (___ __ __ ___ ___ | | ___
| __| / _` | / __| | | | | \___ \ \ \ /\ / / / _ \ / _ \ | | / _ \
| |____ | (_| | \__ \ | |_| | ____) | \ V V / | (_) | | (_) | | | | __/
|______| \__,_| |___/ \__, | |_____/ \_/\_/ \___/ \___/ |_| \___|
__/ |
|___/
install success,enjoy!
2、安裝核心庫facebook/webdriver、easyswoole/curl
[root@ar414.com phpseleniumdemo]# composer require facebook/webdriver=1.7
[root@ar414.com phpseleniumdemo]# composer require easyswoole/curl=1.0.1
3、確認運行沒報錯
[root@ar414.com phpseleniumdemo]# php easyswoole start
| ____| / ____| | |
| |__ __ _ ___ _ _ | (___ __ __ ___ ___ | | ___
>| __| / _` | / __| | | | | \___ \ \ \ /\ / / / _ \ / _ \ | | / _ \
>| |____ | (_| | \__ \ | |_| | ____) | \ V V / | (_) | | (_) | | | | __/
>|______| \__,_| |___/ \__, | |_____/ \_/\_/ \___/ \___/ |_| \___|
> __/ |
> |___/
main server SWOOLE_WEB
listen address 0.0.0.0
listen port 9501
sub server1 CONSOLE => SWOOLE_TCP@127.0.0.1:9500
....
定時更新代理池進程
Tips:代理資源請自行解決,這里只提供例子,實際是用不了的
1、創建項目主目錄
[root@ar414.com phpseleniumdemo]# mkdir App
#composer 指定App作用域
[root@ar414.com phpseleniumdemo]# cat composer.json
{
"autoload": {
"psr-4": { "App\\": "App/"
}
},
"require": {
"easyswoole/easyswoole": "3.1.18",
"facebook/webdriver": "^1.7",
"easyswoole/curl": "1.0.1"
}
}
#更新composer autoload
[root@ar414.com phpseleniumdemo]# composer dump-autoload
2、創建進程目錄(將代理池更新作為一個子進程隨項目啟動運行)
[root@ar414.com phpseleniumdemo]# mkdir App/Process
3、代理池定時爬取(使用Redis List類型保證最新代理IP在頭部,爬蟲邏輯每次從頭部獲取,一個代理IP只用一次)
Tips:代理資源請自行解決,這里只提供例子,實際是用不了的
<?php
/**
* Created by PhpStorm.
* User: ar414.com@gmail.com
* Date: 2019/12/7
* Time: 21:00
*/
namespace App\Process;
use App\Lib\Curl;
use App\Lib\Kv;
use EasySwoole\Component\Process\AbstractProcess;
/**
 * Background process that periodically refreshes the proxy pool.
 *
 * Every TIMER seconds it requests a proxy list from the HTTP API and pushes
 * each "ip:port" entry onto the head of the Redis list PROXY_KV_KEY.
 */
class UpdateProxyPool extends AbstractProcess
{
    // The proxy IPs returned by this API only support the socks5 protocol.
    private $proxyListApi = "http://www.zdopen.com/ShortS5Proxy/GetIP/?api=%s&akey=%s&order=2&type=3";

    /** Redis list key the proxies are pushed to. */
    const PROXY_KV_KEY = 'spider:proxy:list';

    /** Seconds to sleep between two refreshes. */
    const TIMER = 15;

    /**
     * Fill the API id / key placeholders of the proxy list URL.
     *
     * NOTE(review): the demo credentials are hard-coded; a real deployment
     * would presumably read them from configuration, e.g.
     * $_ENV['PROXY_LIST_API'] and $_ENV['PROXY_LIST_KEY'].
     */
    protected function initProxyListApi()
    {
        // The id is passed as a string so it is formatted verbatim even where
        // a 17-digit literal does not fit into a native integer.
        $this->proxyListApi = sprintf($this->proxyListApi, '20191231231237085', '72axxxae0fe34');
    }

    /**
     * Process entry point: poll the proxy API forever.
     *
     * @param mixed $arg process argument (not used)
     */
    public function run($arg)
    {
        $this->initProxyListApi();
        // Requires: composer require easyswoole/curl=1.0.1
        while (true) {
            $ret = Curl::get($this->proxyListApi);
            var_dump($ret);
            foreach ($this->extractProxies($ret) as $proxyItem) {
                // lPush keeps the most recently fetched proxies at the head of the list.
                Kv::redis()->lPush(self::PROXY_KV_KEY, $proxyItem);
            }
            sleep(self::TIMER);
        }
    }

    /**
     * Turn a raw API response into a list of "ip:port" strings.
     *
     * Anything that is not a well-formed success response (empty body,
     * invalid JSON, unexpected code, missing or non-array proxy_list, entries
     * without ip/port) yields no proxies instead of raising warnings or
     * pushing malformed entries.
     *
     * @param mixed $response raw response as returned by Curl::get()
     * @return string[] proxies formatted as "ip:port"
     */
    private function extractProxies($response)
    {
        if (!$response) {
            return [];
        }

        $decoded = json_decode($response, true);
        if (!is_array($decoded) || !isset($decoded['code'], $decoded['data']['proxy_list'])) {
            return [];
        }

        // Loose comparison on purpose: the code may arrive as int or numeric string.
        if ($decoded['code'] != 10001 || !is_array($decoded['data']['proxy_list'])) {
            return [];
        }

        $proxies = [];
        foreach ($decoded['data']['proxy_list'] as $proxy) {
            if (!is_array($proxy) || !isset($proxy['ip'], $proxy['port'])) {
                continue;
            }
            $proxies[] = $proxy['ip'] . ':' . $proxy['port'];
        }

        return $proxies;
    }
}
4、配置代理池更新進程隨項目啟動時啟動(完整代碼鏈接)
/**
 * Registers the proxy pool refresh process with the Swoole server.
 *
 * @param EventRegister $register event register (not used in this snippet)
 */
public static function mainServerCreate(EventRegister $register)
{
    $swooleServer = ServerManager::getInstance()->getSwooleServer();

    // Proxy pool refresh process
    $proxyPoolUpdater = new \App\Process\UpdateProxyPool('UpdateProxyPool', []);
    $swooleServer->addProcess($proxyPoolUpdater->getProcess());
}
定時爬取列表頁進程
爬取列表頁進程(完整代碼鏈接)
<?php
/**
* Created by PhpStorm.
* User: ar414.com@gmail.com
* Date: 2019/12/7
* Time: 22:01
*/
namespace App\Process;
use App\Lib\ChromeDriver;
use App\Lib\Kv;
use EasySwoole\Component\Process\AbstractProcess;
use EasySwoole\EasySwoole\Logger;
/**
 * Background process that periodically scrapes the list page.
 *
 * Every TIMER seconds it loads API in a fresh browser session, extracts all
 * unique "PD=...;" captures from the page source and stores them as a JSON
 * list under the Redis key LIST_KV_KEY.
 */
class ListSpider extends AbstractProcess
{
    const API = 'https://www.188-sb.com/SportsBook.API/web?lid=1&zid=3&pd=%23AC%23B151%23C1%23D50%23E10%23F163%23&cid=42&ctid=42';
    const LIST_KV_KEY = 'spider:list';
    const TIMER = 20; // run once every 20 seconds

    /**
     * Process entry point: scrape the list page forever.
     *
     * @param mixed $arg process argument (not used)
     */
    public function run($arg)
    {
        while (true) {
            // Reset per iteration so cleanup never touches a session that was
            // already quit in a previous round (or was never created).
            $driver = null;
            try {
                $driver = (new ChromeDriver(true))->getDriver();
                $driver->get(self::API);
                $listStr = $driver->getPageSource();
                var_dump($listStr);
                file_put_contents("/www/wwwroot/blog/phpseleniumdemo/listStr.html", $listStr);
                preg_match_all("/PD=(.*);/U", $listStr, $list);
                // array_unique() keeps the original (sparse) keys; re-index so
                // json_encode() emits a JSON list rather than an object.
                $list = array_values(array_unique($list[1]));
                if ($list) {
                    Kv::redis()->set(self::LIST_KV_KEY, json_encode($list));
                }
                var_dump('done');
            } catch (\Throwable $throwable) {
                Logger::getInstance()->log($throwable->getMessage(), 'ListSpiderError');
                var_dump($throwable->getMessage());
            } finally {
                $this->quitDriver($driver);
            }
            sleep(self::TIMER);
        }
    }

    /**
     * Close the browser window and end the session without ever throwing.
     *
     * @param object|null $driver driver returned by ChromeDriver::getDriver(), or null if none was created
     */
    private function quitDriver($driver)
    {
        if ($driver === null) {
            return;
        }

        try {
            $driver->close();
        } catch (\Throwable $ignored) {
            // Best effort only; quit() is still attempted below.
        }

        try {
            $driver->quit();
        } catch (\Throwable $throwable) {
            Logger::getInstance()->log($throwable->getMessage(), 'ListSpiderError');
        }
    }
}
主進程定時從Redis中讀取列表頁任務,有則將每一項丟給異步任務執行
1、完整代碼鏈接
/**
 * Registers the background processes and the list-dispatching timer.
 *
 * Adds the proxy pool and list spider processes to the Swoole server, then
 * installs an onWorkerStart handler: worker 0 reads the scraped list from
 * Redis every 30 seconds and hands each entry to an async task that runs
 * ItemSpider.
 *
 * @param EventRegister $register event register the onWorkerStart handler is set on
 */
public static function mainServerCreate(EventRegister $register)
{
    $swooleServer = ServerManager::getInstance()->getSwooleServer();

    // Proxy pool refresh process
    $swooleServer->addProcess((new \App\Process\UpdateProxyPool('UpdateProxyPool', []))->getProcess());
    // List page crawling process
    $swooleServer->addProcess((new \App\Process\ListSpider('ListSpider', []))->getProcess());

    $register->set($register::onWorkerStart, function (\swoole_server $server, $workerId) {
        // Only worker 0 installs the timer, so the list is dispatched once
        // per tick rather than once per worker.
        if ((int) $workerId !== 0) {
            return;
        }

        Timer::getInstance()->loop(30000, function () {
            $ret = Kv::redis()->get(ListSpider::LIST_KV_KEY);
            if (!$ret) {
                return;
            }

            $items = json_decode($ret, true);
            if (!is_array($items)) {
                // Stored value is not a JSON list/object; nothing to dispatch.
                return;
            }

            foreach ($items as $item) {
                if (!is_string($item)) {
                    // ItemSpider::run() expects a path string.
                    continue;
                }
                TaskManager::async(function () use ($item) {
                    (new ItemSpider(true))->run($item);
                    return true;
                }, function () use ($item) {
                    var_dump("{$item} Done");
                });
            }
        });
    });
}
2、ItemSpider邏輯代碼(完整代碼鏈接)
<?php
/**
* Created by PhpStorm.
* User: ar414.com@gmail.com
* Date: 2019/12/7
* Time: 22:35
*/
namespace App\Spider;
use App\Lib\ChromeDriver;
use EasySwoole\EasySwoole\Logger;
use Facebook\WebDriver\WebDriverBy;
use Facebook\WebDriver\WebDriverExpectedCondition;
/**
 * Scrapes a single item page in its own browser session.
 */
class ItemSpider
{
    /**
     * Load the item page, wait for the market group element and dump the page source.
     *
     * Every "#" in $itemPath is replaced by "/" and the result is appended to
     * the site's "#" fragment. Failures (including failure to start the
     * browser) are logged under the 'Bet365ApiRun' category instead of being
     * propagated, and the browser session is released on every path.
     *
     * @param string $itemPath item path as captured from the list page
     * @return void
     */
    public function run($itemPath)
    {
        $itemPath = str_replace('#', '/', $itemPath);
        $url = "https://www.188-sb.com/#{$itemPath}";
        var_dump($url);

        $driver = null;
        try {
            $driver = (new ChromeDriver(true))->getDriver();
            $driver->get($url);
            // Block until the market group button text is visible, or the wait times out.
            $driver->wait(ChromeDriver::WAIT_SECONDS)->until(
                WebDriverExpectedCondition::visibilityOfElementLocated(
                    WebDriverBy::className('gl-MarketGroupButton_Text')
                )
            );
            Logger::getInstance()->console("The title is '" . $driver->getTitle() . "'\n");
            Logger::getInstance()->console("The current URI is '" . $driver->getCurrentURL() . "'\n");
            $body = $driver->getPageSource();
            var_dump($body);
            // TODO: clean the data and persist it
        } catch (\Throwable $throwable) {
            Logger::getInstance()->log($throwable->getMessage(), 'Bet365ApiRun');
        } finally {
            $this->quitDriver($driver);
        }
    }

    /**
     * Close the browser window and end the session without ever throwing.
     *
     * @param object|null $driver driver returned by ChromeDriver::getDriver(), or null if none was created
     */
    private function quitDriver($driver)
    {
        if ($driver === null) {
            return;
        }

        try {
            $driver->close();
        } catch (\Throwable $ignored) {
            // Best effort only; quit() is still attempted below.
        }

        try {
            $driver->quit();
        } catch (\Throwable $throwable) {
            Logger::getInstance()->log($throwable->getMessage(), 'Bet365ApiRun');
        }
    }
}
3、運行
[root@ar414.com phpseleniumdemo]# php easyswoole start