これは役に立ちます。
これは、任意のタグの内部HTMLコンテンツと属性(attribute)値を抽出するための別の関数で、PHP DOMDocument
とcurl
を使用して、IMDBの映画メタ値(例:画像、タイトル、概要)をPHP配列にフェッチします。
<?php
// Script-scope DOMDocument instance; a function can reach it via `global $dom`.
$dom = new DOMDocument();
/**
 * Fetch a URL with cURL and hand back the response body.
 *
 * @param string $url Address to request.
 * @return string|false The response body on success. On failure, a
 *                      "cURL error (N): message" line when cURL reports an
 *                      error number, otherwise false.
 */
function disguise_curl($url)
{
    $handle = curl_init();
    curl_setopt_array($handle, [
        CURLOPT_AUTOREFERER    => true,
        // Make curl_exec() return the body instead of printing it.
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_URL            => $url,
    ]);

    $body = curl_exec($handle);

    if ($body === false) {
        $code = curl_errno($handle);
        if ($code) {
            // The error text is returned in place of the page content.
            $reason = curl_strerror($code);
            $body = "cURL error ({$code}): {$reason}\n";
        }
    }

    curl_close($handle);

    return $body;
}
/**
 * Return the text found between two delimiters (case-insensitive search).
 *
 * The first occurrence of $start is located, then the first occurrence of
 * $end after it; the text strictly between them is returned.
 *
 * @param string $data  Text to search.
 * @param string $start Opening delimiter (not included in the result).
 * @param string $end   Closing delimiter (not included in the result).
 * @return string The enclosed text, or '' when either delimiter is missing.
 */
function scrape_between($data, $start, $end)
{
    $data = (string) $data;
    $start = (string) $start;
    $end = (string) $end;

    $from = stripos($data, $start);
    if ($from === false) {
        // Opening delimiter absent: nothing to extract.
        return '';
    }

    // Cast guards against substr() returning false on older PHP versions.
    $rest = (string) substr($data, $from + strlen($start));

    $to = stripos($rest, $end);
    if ($to === false) {
        // Closing delimiter absent: treated as "no match", as before.
        return '';
    }

    return (string) substr($rest, 0, $to);
}
/**
 * Return the outer HTML of the element carrying the given id.
 *
 * @param string $id   Value of the id attribute to look up.
 * @param string $html HTML markup to search.
 * @return string|false Serialized element, or FALSE when $html is empty or
 *                      no element has that id.
 */
function getHTMLByID($id, $html) {
    // DOMDocument::loadHTML() rejects an empty source, so bail out early.
    if (empty($html)) {
        return FALSE;
    }

    // Collect parser complaints internally while parsing; the caller's
    // libxml setting is put back afterwards.
    $previous = libxml_use_internal_errors(true);
    try {
        $dom = new DOMDocument;
        $dom->validateOnParse = true;
        $dom->loadHTML($html);

        $node = $dom->getElementById($id);

        return $node ? $dom->saveHTML($node) : FALSE;
    } finally {
        // Drop the buffered parse errors so they do not pile up across calls.
        libxml_clear_errors();
        libxml_use_internal_errors($previous);
    }
}
/**
 * Collect every element whose class attribute contains the given class name(s).
 *
 * Matching is done on whole class tokens, so 'title' does not match an
 * element whose only class is 'subtitle'. When $class holds several
 * whitespace-separated names, an element must carry all of them.
 *
 * @param string $class     Class name (or whitespace-separated names) to match.
 * @param string $html      HTML markup to search.
 * @param bool   $bring_tag Pass true to receive the DOM nodes themselves
 *                          instead of their serialized HTML.
 * @return array Matches in document order: HTML strings by default, DOM
 *               nodes when $bring_tag is true. Empty when nothing matches
 *               or $html is empty.
 */
function getHTMLByClass($class, $html, $bring_tag=false){
    $class_arr = [];

    // DOMDocument::loadHTML() rejects an empty source, so bail out early.
    if (empty($html)) {
        return $class_arr;
    }

    $separator = '/[ \t\r\n\f]+/';
    $wanted = preg_split($separator, (string) $class, -1, PREG_SPLIT_NO_EMPTY);
    if (empty($wanted)) {
        return $class_arr;
    }

    // Collect parser complaints internally while parsing; the caller's
    // libxml setting is put back afterwards.
    $previous = libxml_use_internal_errors(true);
    try {
        $dom = new DOMDocument;
        $dom->validateOnParse = true;
        $dom->loadHTML($html);

        $xpath = new DOMXPath($dom);

        // The class name is compared in PHP rather than spliced into the
        // XPath expression, so quotes in $class cannot break the query.
        foreach ($xpath->query('//*[@class]') as $tag) {
            $tokens = preg_split($separator, $tag->getAttribute('class'), -1, PREG_SPLIT_NO_EMPTY);
            if (array_diff($wanted, $tokens)) {
                continue; // at least one requested class is missing
            }
            $class_arr[] = ($bring_tag === true) ? $tag : $dom->saveHTML($tag);
        }
    } finally {
        // Drop the buffered parse errors so they do not pile up across calls.
        libxml_clear_errors();
        libxml_use_internal_errors($previous);
    }

    return $class_arr;
}
/**
 * Read one attribute from every element with the given tag name.
 *
 * @param string $html HTML markup to search.
 * @param string $tag  Tag name of the elements to inspect.
 * @param string $attr Attribute to read from each element.
 * @return array One entry per matching element, in document order; an
 *               element lacking the attribute contributes an empty string.
 *               Empty array when $html is empty.
 */
function get_domattr($html, $tag, $attr)
{
    $attr_vals = [];

    if (empty($html)) {
        return $attr_vals;
    }

    // Collect parser complaints internally while parsing; the caller's
    // libxml setting is put back afterwards.
    $previous = libxml_use_internal_errors(true);
    try {
        $dom = new DOMDocument;
        $dom->validateOnParse = true;
        $dom->loadHTML($html);

        foreach ($dom->getElementsByTagName($tag) as $element) {
            $attr_vals[] = $element->getAttribute($attr);
        }
    } finally {
        // Drop the buffered parse errors so they do not pile up across calls.
        libxml_clear_errors();
        libxml_use_internal_errors($previous);
    }

    return $attr_vals;
}
/**
 * Serialize every element with the given tag name.
 *
 * Each match is serialized with DOMDocument::saveXML(), so the output is
 * XML-style markup of the element including its own tag.
 *
 * @param string $tag  Tag name to collect.
 * @param string $html HTML markup to search.
 * @return array Serialized elements in document order; empty when $html is
 *               empty or nothing matches.
 */
function getHTMLByTag($tag, $html) {
    $attr_vals = [];

    if (empty($html)) {
        return $attr_vals;
    }

    // Collect parser complaints internally while parsing; the caller's
    // libxml setting is put back afterwards.
    $previous = libxml_use_internal_errors(true);
    try {
        // Parse into a document owned by this call, so the function works
        // regardless of what exists in the global scope.
        $dom = new DOMDocument;
        $dom->validateOnParse = true;
        $dom->loadHTML($html);

        foreach ($dom->getElementsByTagName($tag) as $taghtml) {
            $attr_vals[] = $dom->saveXML($taghtml);
        }
    } finally {
        // Drop the buffered parse errors so they do not pile up across calls.
        libxml_clear_errors();
        libxml_use_internal_errors($previous);
    }

    return $attr_vals;
}
// Download the listing page that will be scraped.
$url = "http://www.imdb.com/search/title?genres=action";
$page_html = disguise_curl($url);

// One row per list position; each pass below fills in its own columns.
$movie_list = array();

// Pass 1: first <img src> inside every element matched by class 'image'.
$result_html = getHTMLByClass('image', $page_html);
foreach ($result_html as $i => $cont_tag) {
    $img_link = get_domattr($cont_tag, 'img', 'src');
    $movie_list[$i]['photo'] = empty($img_link) ? 'na' : $img_link[0];
}

// Pass 2: title, link and outline from every element matched by class 'title'.
// Rows are paired with pass 1 purely by position in the result lists.
$link_pre = 'http://imdb.com';
$result_html = getHTMLByClass('title', $page_html);
foreach ($result_html as $i => $cont_tag) {
    $mtitle = getHTMLByTag('a', $cont_tag);
    $mlink = get_domattr($cont_tag, 'a', 'href');
    $moutline = getHTMLByClass('outline', $cont_tag);

    // 'na' marks a value that could not be found in this fragment.
    $movie_list[$i]['title'] = empty($mtitle) ? 'na' : $mtitle[0];
    $movie_list[$i]['link'] = empty($mlink) ? 'na' : $link_pre . $mlink[0];
    $movie_list[$i]['outline'] = empty($moutline) ? 'na' : $moutline[0];
}

// Dump the collected rows inside <pre> so the array layout stays readable.
echo '<pre>', print_r($movie_list, true), '</pre>';
?>
サンプル出力:
Array
(
[0] => Array
(
[photo] => http://ia.media-imdb.com/images/M/[email protected]_V1._SX54_CR0,0,54,74_.jpg
[title] => Captain America: Civil War
[link] => http://imdb.com/title/tt3498820/
[outline] => Political interference in the Avengers' activities causes a rift between former allies Captain America and Iron Man.
)
[1] => Array
(
[photo] => http://ia.media-imdb.com/images/M/[email protected]_V1._SX54_CR0,0,54,74_.jpg
[title] => Batman v Superman: Dawn of Justice
[link] => http://imdb.com/title/tt2975590/
[outline] => Fearing the actions of Superman are left unchecked, Batman takes on the man of steel, while the world wrestles with what kind of a hero it really needs. With Batman and Superman fighting each other, a new threat, Doomsday, is created by Lex Luthor. It's up to Superman and Batman to set aside their differences along with Wonder Woman to stop Lex Luthor and Doomsday from destroying Metropolis.
)
[2] => Array
(
[photo] => http://ia.media-imdb.com/images/M/[email protected]_V1._SX54_CR0,0,54,74_.jpg
[title] => na
[link] => na
[outline] => na
)
)
(上記の関数は、ID、タグ名、およびクラスのマッチングによって要素を抽出します。)
[RegExはXHTML自己完結型タグを除くオープンタグと一致する可能性があります](http://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags) – andrew
とにかくDOMを使用してください! – Ikari
サイトをスクレイピングする代わりに、ここから始めてください:http://www.imdb.com/licensing/ // この質問もこの点で参考になるかもしれません:http://stackoverflow.com/questions/1966503/does-imdb-provide-an-api – CBroe