0xDEADBEEF (⇉english edition⇉)

[RSS] [odkazy]

csfd-crawler.php

12. 3. 2014 (před 6 lety)

Example crawler using Matcher and AsyncCurl

use Atrox\Matcher;
use Atrox\Curl;
use Atrox\Async;

// Prefixes a site-relative href with the csfd.cz origin.
$toAbsoluteUserUrl = function ($href) {
  return "http://www.csfd.cz{$href}";
};

// Field extractors applied to every matched row of the user overview table.
// The numeric columns are read positionally (td[3], td[4], td[5], td[10]).
$userListFields = [
  'url'      => Matcher::single('td/a/@href')->map($toAbsoluteUserUrl),
  'points'   => Matcher::single('td[3]')->asInt(),
  'ratings'  => Matcher::single('td[4]')->asInt(),
  'comments' => Matcher::single('td[5]')->asInt(),
  'films'    => Matcher::single('td[10]')->asInt(),
];

// Callable that takes raw HTML and returns the extracted rows.
$userListMatcher = Matcher::multi(
  '//table[@class="ui-table-list"]//tr',
  (object) $userListFields
)->fromHtml();

// Prefixes a site-relative href with the csfd.cz origin.
$toAbsoluteRatedUrl = function ($href) {
  return "http://www.csfd.cz{$href}";
};

// Field extractors applied to every body row of a user's ratings table.
// 'rating' is taken from either the image alt text or the <strong> cell.
$ratingFields = [
  'url'    => Matcher::single('td[1]/a/@href')->map($toAbsoluteRatedUrl),
  'rating' => 'td[2]/img/@alt | td[2]/strong',
  'date'   => 'td[3]',
];

// Callable that takes raw HTML and returns the extracted rows.
$ratingMatcher = Matcher::multi(
  '//table[@class="ui-table-list"]/tbody//tr',
  (object) $ratingFields
)->fromHtml();


// Options shared by every request issued through the promise-based client:
// a 30 second timeout and a fixed User-Agent header.
$curlOptions = [
  CURLOPT_TIMEOUT   => 30,
  CURLOPT_USERAGENT => 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
];

$curl = Curl::promises()->configure($curlOptions);


/**
 * Fetches $url through the shared $curl client, retrying when the request
 * throws.
 *
 * The original loop retried forever with no bound (its $try counter was never
 * used), so a permanently failing URL would spin indefinitely. Attempts are
 * now capped; once they are exhausted the last failure is rethrown wrapped in
 * a RuntimeException.
 *
 * @param string        $url           Absolute URL to download.
 * @param callable|null $checkFunction Accepted for interface compatibility;
 *                                     currently not consulted.
 *                                     NOTE(review): intended contract is not
 *                                     visible here — confirm before wiring in.
 * @return mixed Whatever Async::flow() returns for the generator below.
 *               The response is handed back by yielding it last — presumably
 *               Async::flow treats the final yielded value as the flow's
 *               result; confirm against the Async library.
 */
function fetch($url, $checkFunction = null) {
  global $curl;
  return Async::flow(function () use ($url, $checkFunction, $curl) {
    $maxTries = 5;
    $lastError = null;

    for ($try = 1; $try <= $maxTries; $try++) {
      try {
        $resp = (yield $curl->get($url));
      } catch (Exception $e) {
        // Remember the failure and try again while attempts remain.
        $lastError = $e;
        continue;
      }

      yield $resp;
      return;
    }

    throw new RuntimeException("Giving up on $url after $maxTries failed attempts", 0, $lastError);
  });
}

// Crawl driver: walks the user overview pages 1..3000, and for each listed
// user downloads every page of their ratings, then appends the user (with the
// collected ratings) as one JSON line to csfd-users.data.
// The first argument (100) is passed to Async::concurrently — presumably the
// maximum number of flows in flight; confirm against the Async library.
Async::concurrently(100, function () use ($userListMatcher, $ratingMatcher) {

  foreach (range(1, 3000) as $pageNo) {

    yield Async::flow(function () use ($pageNo, $userListMatcher, $ratingMatcher) {

      $userListPage = (yield fetch("http://www.csfd.cz/uzivatele/prehled/strana-$pageNo/"));
      $userList = $userListMatcher($userListPage->body);

      foreach ($userList as $user) {
        $ratings = [];

        // Number of rating pages, assuming 100 ratings per page.
        $maxPage = (int) ceil($user->ratings / 100);

        // A counting loop instead of range(1, $maxPage): range(1, 0) yields
        // [1, 0], which made users with no ratings request pages 1 and 0.
        for ($ratingPageNo = 1; $ratingPageNo <= $maxPage; $ratingPageNo++) {
          $url = $user->url . 'hodnoceni/strana-' . $ratingPageNo;
          $ratingPage = (yield fetch($url));
          $rs = $ratingMatcher($ratingPage->body);
          $ratings = array_merge($ratings, $rs);
        }

        // Replace the rating count with the actual list of ratings.
        $user->ratings = $ratings;

        file_put_contents('csfd-users.data', json_encode($user)."\n", LOCK_EX | FILE_APPEND);
      }

    });

  }

});

// Hand control to the curl client so the scheduled flows are executed.
$curl->loop();

Another version that requires only Atrox\Matcher.

use Atrox\Matcher;


// Prefixes a site-relative href with the csfd.cz origin (HTTPS).
$toAbsoluteUserUrl = function ($href) {
  return "https://www.csfd.cz{$href}";
};

// Field extractors applied to every matched row of the user overview table.
// The numeric columns are read positionally (td[3], td[4], td[5], td[10]).
$userListFields = [
  'url'      => Matcher::single('td/a/@href')->map($toAbsoluteUserUrl),
  'points'   => Matcher::single('td[3]')->asInt(),
  'ratings'  => Matcher::single('td[4]')->asInt(),
  'comments' => Matcher::single('td[5]')->asInt(),
  'films'    => Matcher::single('td[10]')->asInt(),
];

// Callable that takes raw HTML and returns the extracted rows.
$userListMatcher = Matcher::multi(
  '//table[@class="ui-table-list"]//tr',
  (object) $userListFields
)->fromHtml();

// Prefixes a site-relative href with the csfd.cz origin (HTTPS).
$toAbsoluteRatedUrl = function ($href) {
  return "https://www.csfd.cz{$href}";
};

// Field extractors applied to every body row of a user's ratings table.
// 'rating' is taken from either the image alt text or the <strong> cell.
$ratingFields = [
  'url'    => Matcher::single('td[1]/a/@href')->map($toAbsoluteRatedUrl),
  'rating' => 'td[2]/img/@alt | td[2]/strong',
  'date'   => 'td[3]',
];

// Callable that takes raw HTML and returns the extracted rows.
$ratingMatcher = Matcher::multi(
  '//table[@class="ui-table-list"]/tbody//tr',
  (object) $ratingFields
)->fromHtml();


/**
 * Creates a curl easy handle for $url, configured to return the response body
 * as a string (without headers) and to time out after 10 seconds.
 *
 * @param string $url URL the handle will request.
 * @return resource|\CurlHandle|false The handle as returned by curl_init().
 */
function makeCurl($url) {
  $h = curl_init($url);
  // Options are set in one call; the original set CURLOPT_RETURNTRANSFER twice.
  curl_setopt_array($h, [
    CURLOPT_RETURNTRANSFER => true,
    CURLOPT_HEADER         => 0,
    CURLOPT_TIMEOUT        => 10,
  ]);
  return $h;
}


// Pending work, keyed by URL. Each value is a callback that receives the
// downloaded body of that URL. Callbacks may add further entries, which is
// why they capture $workItems by reference.
$workItems = [];

foreach (range(1, 2000) as $pageNo) {
  // User overview page: for every listed user, queue all of their rating pages.
  $workItems["https://www.csfd.cz/uzivatele/prehled/strana-$pageNo/"] = function ($userListPage) use (&$workItems, $userListMatcher, $ratingMatcher) {
    $userList = $userListMatcher($userListPage);

    foreach ($userList as $user) {
      // Number of rating pages, assuming 100 ratings per page.
      $maxPage = (int) ceil($user->ratings / 100);

      // A counting loop instead of range(1, $maxPage): range(1, 0) yields
      // [1, 0], which queued pages 1 and 0 for users with no ratings.
      for ($ratingPageNo = 1; $ratingPageNo <= $maxPage; $ratingPageNo++) {
        $url = $user->url . 'hodnoceni/strana-' . $ratingPageNo. '/';

        // Rating page: append "<user url> <ratings as JSON>" as one line.
        $workItems[$url] = function ($ratingPage) use ($ratingMatcher, $user) {
          $ratings = $ratingMatcher($ratingPage);
          $line = $user->url." ".json_encode($ratings)."\n";
          file_put_contents('csfd-users.data', $line, LOCK_EX | FILE_APPEND);
        };
      }
    }
  };
}


// Callbacks of transfers that have been started, keyed by URL.
$inProgress = [];
$multi = curl_multi_init();

// Moves one randomly chosen pending work item into $inProgress and starts
// its transfer on the multi handle.
$enqueueWorkItem = function () use (&$workItems, &$inProgress, $multi) {
  $pickedUrl = array_rand($workItems);
  $handle = makeCurl($pickedUrl);
  curl_multi_add_handle($multi, $handle);

  $callback = $workItems[$pickedUrl];
  $inProgress[$pickedUrl] = $callback;
  unset($workItems[$pickedUrl]);
};


// Number of transfers kept in flight at once.
$connections = 21;

// Start the initial batch of transfers.
$started = 0;
while ($started < $connections) {
  $enqueueWorkItem();
  $started++;
}


// Main transfer loop: drive the multi handle, hand finished downloads to
// their callbacks, and keep up to $connections transfers in flight until no
// transfer is running and no work is pending.
$running = null;
do {
  $status = curl_multi_exec($multi, $running);
  if ($running && curl_multi_select($multi) === -1) {
    // select() can fail immediately; pause briefly rather than busy-looping.
    usleep(1000);
  }

  // Completed transfers are processed BEFORE refilling, because their
  // callbacks may add new work items that the refill below must see.
  while (($info = curl_multi_info_read($multi)) !== false) {
    if ($info['msg'] !== CURLMSG_DONE) {
      continue;
    }

    $h = $info['handle'];
    $url = curl_getinfo($h, CURLINFO_EFFECTIVE_URL);

    // Only successful transfers with a known callback are processed; failed
    // ones are dropped (not retried), as in the original.
    if ($info['result'] === CURLE_OK && isset($inProgress[$url])) {
      $inProgress[$url](curl_multi_getcontent($h));
    }

    // Always release the callback and the handle, including on failure;
    // the original leaked both for transfers that did not end with CURLE_OK.
    unset($inProgress[$url]);
    curl_multi_remove_handle($multi, $h);
    curl_close($h);
  }

  // Top up the pool. Newly added handles are not reflected in $running until
  // the next curl_multi_exec(), so remember that something was queued.
  $enqueued = false;
  for ($i = $running; $i < $connections && !empty($workItems); $i++) {
    $enqueueWorkItem();
    $enqueued = true;
  }
} while (($running || $enqueued) && $status === CURLM_OK);

curl_multi_close($multi);
píše k47 (@kaja47, k47)