csfd-crawler.php
12. 3. 2014 #kód
Example crawler using Matcher and AsyncCurl
use Atrox\Matcher;
use Atrox\Curl;
use Atrox\Async;

// Matcher applied to a user-overview page: one object per <tr> of the table
// with class "ui-table-list". The relative profile link is turned into an
// absolute URL; the numeric fields are read from fixed column positions.
$userListMatcher = Matcher::multi('//table[@class="ui-table-list"]//tr', (object) [
    'url'      => Matcher::single('td/a/@href')->map(function ($x) { return "http://www.csfd.cz$x"; }),
    'points'   => Matcher::single('td[3]')->asInt(),
    'ratings'  => Matcher::single('td[4]')->asInt(),
    'comments' => Matcher::single('td[5]')->asInt(),
    'films'    => Matcher::single('td[10]')->asInt(),
])->fromHtml();

// Matcher applied to one page of a user's ratings: one object per row in the
// <tbody> of the "ui-table-list" table. 'rating' is taken either from the
// image alt text or from the <strong> element of the second column.
$ratingMatcher = Matcher::multi('//table[@class="ui-table-list"]/tbody//tr', (object) [
    'url'    => Matcher::single('td[1]/a/@href')->map(function ($x) { return "http://www.csfd.cz$x"; }),
    'rating' => 'td[2]/img/@alt | td[2]/strong',
    'date'   => 'td[3]',
])->fromHtml();

// Shared HTTP client: 30 second timeout and a fixed User-Agent header.
$curl = Curl::promises()->configure([
    CURLOPT_TIMEOUT   => 30,
    CURLOPT_USERAGENT => 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
]);

/**
 * Builds an asynchronous flow that downloads $url and yields the response.
 *
 * When awaiting the request throws an Exception, the exception is discarded
 * and the request is issued again. There is no retry limit, so a URL that
 * keeps failing is retried indefinitely.
 *
 * @param string        $url           Address to download.
 * @param callable|null $checkFunction Kept for interface compatibility; it is not used.
 *
 * @return mixed The value returned by Async::flow(); the response is the
 *               last value yielded by the wrapped generator.
 */
function fetch($url, $checkFunction = null)
{
    global $curl;

    return Async::flow(function () use ($url, $curl) {
        while (true) {
            try {
                $resp = (yield $curl->get($url));
            } catch (Exception $e) {
                // Request failed: try the same URL again.
                continue;
            }

            yield $resp;
            return;
        }
    });
}

// Walk overview pages 1..3000 with the concurrency argument set to 100. For
// every user found, download all of their rating pages, replace the numeric
// rating count with the list of extracted ratings and append the user as one
// JSON line to csfd-users.data.
Async::concurrently(100, function () use ($userListMatcher, $ratingMatcher) {
    foreach (range(1, 3000) as $pageNo) {
        yield Async::flow(function () use ($pageNo, $userListMatcher, $ratingMatcher) {
            $userListPage = (yield fetch("http://www.csfd.cz/uzivatele/prehled/strana-$pageNo/"));
            $userList = $userListMatcher($userListPage->body);

            foreach ($userList as $user) {
                $ratings = [];

                // The code assumes 100 ratings per page. A counted loop is
                // used instead of range(1, $maxPage) because range(1, 0)
                // produces [1, 0] and would request two pages for a user
                // who has no ratings at all.
                $maxPage = (int) ceil($user->ratings / 100);
                for ($ratingPageNo = 1; $ratingPageNo <= $maxPage; $ratingPageNo++) {
                    $url = $user->url . 'hodnoceni/strana-' . $ratingPageNo;
                    $ratingPage = (yield fetch($url));
                    $rs = $ratingMatcher($ratingPage->body);
                    $ratings = array_merge($ratings, $rs);
                }

                $user->ratings = $ratings;
                file_put_contents('csfd-users.data', json_encode($user)."\n", LOCK_EX | FILE_APPEND);
            }
        });
    }
});

$curl->loop();
Another version that requires only Atrox\Matcher.
use Atrox\Matcher;

// Matcher applied to a user-overview page: one object per <tr> of the table
// with class "ui-table-list". The relative profile link is turned into an
// absolute URL; the numeric fields are read from fixed column positions.
$userListMatcher = Matcher::multi('//table[@class="ui-table-list"]//tr', (object) [
    'url'      => Matcher::single('td/a/@href')->map(function ($x) { return "https://www.csfd.cz$x"; }),
    'points'   => Matcher::single('td[3]')->asInt(),
    'ratings'  => Matcher::single('td[4]')->asInt(),
    'comments' => Matcher::single('td[5]')->asInt(),
    'films'    => Matcher::single('td[10]')->asInt(),
])->fromHtml();

// Matcher applied to one page of a user's ratings: one object per row in the
// <tbody> of the "ui-table-list" table.
$ratingMatcher = Matcher::multi('//table[@class="ui-table-list"]/tbody//tr', (object) [
    'url'    => Matcher::single('td[1]/a/@href')->map(function ($x) { return "https://www.csfd.cz$x"; }),
    'rating' => 'td[2]/img/@alt | td[2]/strong',
    'date'   => 'td[3]',
])->fromHtml();

/**
 * Creates a curl easy handle for $url that returns the body as a string,
 * omits response headers and times out after 10 seconds.
 *
 * The URL is also stored as the handle's private data so that the work item
 * belonging to a finished transfer can be looked up under exactly the key it
 * was queued with.
 *
 * @param string $url Address to download.
 *
 * @return resource|\CurlHandle The configured handle.
 */
function makeCurl($url)
{
    $h = curl_init($url);
    curl_setopt($h, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($h, CURLOPT_HEADER, 0);
    curl_setopt($h, CURLOPT_TIMEOUT, 10);
    curl_setopt($h, CURLOPT_PRIVATE, $url);

    return $h;
}

// Work queue: URL => callback that receives the downloaded body. It starts
// with the overview pages; their callbacks add one entry per rating page of
// every user they find.
$workItems = [];

foreach (range(1, 2000) as $pageNo) {
    $workItems["https://www.csfd.cz/uzivatele/prehled/strana-$pageNo/"] = function ($userListPage) use (&$workItems, $userListMatcher, $ratingMatcher) {
        $userList = $userListMatcher($userListPage);

        foreach ($userList as $user) {
            // The code assumes 100 ratings per page. A counted loop is used
            // instead of range(1, $maxPage) because range(1, 0) produces
            // [1, 0] and would queue two pages for a user with no ratings.
            $maxPage = (int) ceil($user->ratings / 100);

            for ($ratingPageNo = 1; $ratingPageNo <= $maxPage; $ratingPageNo++) {
                $url = $user->url . 'hodnoceni/strana-' . $ratingPageNo. '/';

                $workItems[$url] = function ($ratingPage) use ($ratingMatcher, $user) {
                    $ratings = $ratingMatcher($ratingPage);
                    // Output record: user URL, separator, JSON-encoded ratings.
                    $line = $user->url." \n".json_encode($ratings)."\n";
                    file_put_contents('csfd-users.data', $line, LOCK_EX | FILE_APPEND);
                };
            }
        }
    };
}

$multi = curl_multi_init();

// URL => callback for transfers currently attached to $multi.
$inProgress = [];

// Moves one randomly chosen work item from the queue into the multi handle.
// Must only be called while $workItems is not empty.
$enqueueWorkItem = function () use (&$workItems, &$inProgress, $multi) {
    $url = array_rand($workItems);
    curl_multi_add_handle($multi, makeCurl($url));
    $inProgress[$url] = $workItems[$url];
    unset($workItems[$url]);
};

// Maximum number of transfers attached at the same time.
$connections = 21;

$running = null;

do {
    // Top up to the connection limit. This runs at the start of every
    // iteration so that items queued by callbacks in the previous iteration
    // are picked up even when no transfer is active any more.
    while (count($inProgress) < $connections && !empty($workItems)) {
        $enqueueWorkItem();
    }

    $status = curl_multi_exec($multi, $running);

    if ($running && curl_multi_select($multi) === -1) {
        // select can fail immediately; back off briefly instead of spinning.
        usleep(1000);
    }

    while (($info = curl_multi_info_read($multi)) !== false) {
        if ($info['msg'] !== CURLMSG_DONE) {
            continue;
        }

        $h = $info['handle'];
        $url = curl_getinfo($h, CURLINFO_PRIVATE);

        $callback = isset($inProgress[$url]) ? $inProgress[$url] : null;
        unset($inProgress[$url]);

        // Failed transfers are dropped without a retry, but their handle is
        // still released below.
        if ($info['result'] === CURLE_OK && $callback !== null) {
            $callback(curl_multi_getcontent($h));
        }

        curl_multi_remove_handle($multi, $h);
        curl_close($h);
    }
} while ($status === CURLM_OK && (!empty($inProgress) || !empty($workItems)));

curl_multi_close($multi);