採集程序

最近項目需要寫一個採集程序,用於填充網站數據。學習了一下之後,寫了個小採集程序。

一共寫了兩個文件:1、鏈接尋址;2、採集數據封裝入庫。

1、鏈接尋址(xunzhi.php)

// Crawl entry script: reads the district index, then walks each district's
// listing page by page and imports every restaurant linked from it.
ini_set('max_execution_time', '0'); // lift the execution time limit for the whole crawl
require_once(dirname(__FILE__) . '/../application/initialize.php');
require_once 'index2.php';
$content = file_get_contents("http://****/beijing/districts");
preg_match_all("/<span class=\"district_info\">[^.]{1,300}<\/span>/",$content,$district);
// Resolve the listing URL of each district
foreach($district[0] as $districthtml){
    preg_match_all("/\/beijing\/[\w]*\/all/",$districthtml,$districturl);
    if(!isset($districturl[0][0])){
        // This block carries no listing link, so there is nothing to crawl
        continue;
    }

    echo $districthtml."開始採集".$districturl[0][0]."<br/>";
    $page = 1;
    // Keep requesting pages until one fails to load or lists no restaurants
    while(true){
        $content = file_get_contents("http://****".$districturl[0][0]."/p".$page."");
        if($content === false){
            break;
        }
        preg_match_all("/<h1 class=\"name\">[^.]{1,500}<\/h1>/",$content,$restaurant);
        if(empty($restaurant[0])){
            break;
        }

        // A separate loop variable keeps the district block intact while iterating
        foreach($restaurant[0] as $restauranthtml){
            preg_match_all("/restaurant\/[\w]*/",$restauranthtml,$url);
            if(!isset($url[0][0])){
                // Heading without a restaurant link: skip it instead of importing a bare host URL
                continue;
            }
            // NOTE(review): this host differs from the masked host used for the other URLs — confirm
            $id = caijirestaurant("http://meican.com/".$url[0][0]);
            $condition = array(
                "restaurant" => $id,
                "url" => "http://****/".$url[0][0]
            );
            Restaurant::caiji($condition);
            echo "http://***/".$url[0][0],"<br/>"."採集完畢";

            // Empty PHP's own output buffer first (only if one is active),
            // then ask the server layer to send what it has
            if(ob_get_level() > 0){
                ob_flush();
            }
            flush();
        }
        echo "第".$page."頁";
        $page++;
    }
}
echo "採集完畢hhhwoku";

2、數據封裝入庫

/**
 * Returns $parts[$index] when it is set, otherwise an empty string.
 *
 * Lets the scraper read positional pieces of explode()/preg_*() results
 * without triggering undefined-index errors when the markup is missing.
 *
 * @param mixed $parts Array to read from (anything else yields "")
 * @param int   $index Position to read
 * @return mixed The element, or "" when absent
 */
function caijirestaurant_part($parts, $index){
    return (is_array($parts) && isset($parts[$index])) ? $parts[$index] : "";
}

/**
 * Extracts the tag-stripped, trimmed text that follows the first <td> in a
 * table-row fragment. Results shorter than two bytes are reported as "".
 *
 * @param string $row HTML fragment of one table row
 * @return string Cell text, or "" when missing or too short
 */
function caijirestaurant_cell($row){
    $cells = explode("<td>", $row);
    $text = trim(strip_tags(caijirestaurant_part($cells, 1)));
    return strlen($text) < 2 ? "" : $text;
}

/**
 * Downloads one restaurant page, scrapes its details, and stores the
 * restaurant together with its menu categories and dishes.
 *
 * Stores data through Restaurant::Register(), Restaurant::categoryNew() and
 * Restaurant::dishNew(), and sends a UTF-8 content-type header when headers
 * have not been sent yet.
 *
 * @param string $url Address of the restaurant page
 * @return mixed The 'id' entry of the value returned by Restaurant::Register()
 */
function caijirestaurant($url){
    $content = file_get_contents($url);
    if($content === false){
        // NOTE(review): a failed download still registers an (empty) restaurant below,
        // as before — confirm whether it should be skipped instead
        $content = "";
    }

    // Everything after the opening <body> tag.
    // split() was removed in PHP 7.0: regex delimiters now go through preg_split(),
    // literal delimiters through explode().
    $data = preg_split("/<body id=\"restaurant\" class=\"[\s]*\">/", $content);
    $body = caijirestaurant_part($data, 1);

    // Menu markup: from <ul class="dishes"> up to the following <div class="right">
    $dishes = explode("<ul class=\"dishes\">", $body);
    $dishes = explode("<div class=\"right\">", caijirestaurant_part($dishes, 1));
    $menuhtml = $dishes[0];

    // The same pattern both cuts the menu into dish chunks and collects the
    // category header items that separate those chunks
    $sectionpattern = "/<li class=\"dish[^.]{4,100}is-section\">[^.]{4,250}<\/li>/";
    $dishesdata = preg_split($sectionpattern, $menuhtml);
    if($dishesdata === false){
        $dishesdata = array();
    }
    preg_match_all($sectionpattern, $menuhtml, $category);
    $categoryheaders = isset($category[0]) ? $category[0] : array();

    // Star rating: the single character following "rate" after the rating div's class prefix
    $rate = explode("<div class=\"restaurant_rate", $body);
    preg_match_all("/rate[\S]{1}/", caijirestaurant_part($rate, 1), $ratedata);
    $ratedata = isset($ratedata[0][0]) ? substr($ratedata[0][0], 4) : "";

    // Address: text of the 3rd and 4th </span>-terminated pieces at the top of
    // the content div, keeping the two parts that follow "&rsaquo;" separators
    $address = explode("<div id=\"content\">", $content);
    $addressdata = explode("<ul class=\"restaurant_list not_index\">", caijirestaurant_part($address, 1));
    $addressdata = explode("</span>", $addressdata[0]);
    $address = strip_tags(caijirestaurant_part($addressdata, 2)).strip_tags(caijirestaurant_part($addressdata, 3));
    $address = explode("&rsaquo;", $address);
    $address = trim(caijirestaurant_part($address, 1)).trim(caijirestaurant_part($address, 2));

    // Restaurant name
    preg_match_all("/<h1 class=\"name\">[\s\S]+<\/h1>/", $content, $restaurant);
    $name = isset($restaurant[0][0]) ? strip_tags($restaurant[0][0]) : "";

    // Phone number: up to two digit runs found inside the "tel" div, concatenated
    preg_match_all("/<div class=\"tel\">[\s\S]{1,200}<\/div>/", $content, $tel);
    preg_match_all("/[\d]{1,4}[\s\S]{1,50}[\d]{1,5}/", isset($tel[0][0]) ? $tel[0][0] : "", $phone);
    $phonematches = isset($phone[0]) ? $phone[0] : array();
    $phone = strip_tags(caijirestaurant_part($phonematches, 0)).strip_tags(caijirestaurant_part($phonematches, 1));

    // Coordinates: the first two value="…" attributes from the center_latitude
    // hidden input onwards (the greedy match is what brings the second one into range;
    // presumably it is the longitude input — confirm against the page)
    preg_match_all("/<input type=\"hidden\" id=\"center_latitude\"[\s\S]+\/>/", $content, $latlng);
    preg_match_all("/value=\"[\s\S]{1,20}\"/", isset($latlng[0][0]) ? $latlng[0][0] : "", $marker);
    $markervalues = isset($marker[0]) ? $marker[0] : array();
    $lat = explode("\"", caijirestaurant_part($markervalues, 0));
    $lng = explode("\"", caijirestaurant_part($markervalues, 1));

    // Info table: rows 0-3 feed order time, delivery charge, minimum and scope
    preg_match_all("/<table class=\"restaurant_info_all\">[\s\S]+<\/table>/", $content, $infor);
    $information = explode("</tr>", isset($infor[0][0]) ? $infor[0][0] : "");
    $timedata = caijirestaurant_cell(caijirestaurant_part($information, 0));
    $deliverydata = caijirestaurant_cell(caijirestaurant_part($information, 1));
    $minmumdata = caijirestaurant_cell(caijirestaurant_part($information, 2));
    $scopedata = caijirestaurant_cell(caijirestaurant_part($information, 3));

    // Marked offline only when minimum, scope and order time are all blank
    if($minmumdata === "" && $scopedata === "" && $timedata === ""){
        $state = "offline";
    }else{
        $state = "online";
    }

    // The caller echoes and flushes between calls, so only set the header while still possible
    if(!headers_sent()){
        header("content-type:text/html; charset=utf-8");
    }

    $condition = array(
        "name" => trim($name),
        "minimum" => trim($minmumdata),
        "scope" => trim($scopedata),
        "deliverycharge" => trim($deliverydata),
        "order_time" => trim($timedata),
        "phone" => trim($phone),
        "address" => trim($address),
        "state" => trim($state),
        "lat" => trim(caijirestaurant_part($lat, 1)),
        "lng" => trim(caijirestaurant_part($lng, 1)),
        "star" => trim($ratedata)
    );
    $restaurant = Restaurant::Register($condition);
    $id = $restaurant['id'];

    // Create one category per section header, remembering the ids in page order
    $restaurantCategory = array();
    foreach($categoryheaders as $headerhtml){
        preg_match("/<span class=\"name\">[\s\S]+<\/span>/", $headerhtml, $categoryname);
        $categorytext = isset($categoryname[0]) ? strip_tags($categoryname[0]) : "";
        $condition = array(
            "name" => $categorytext,
            "restaurant" => $id,
        );
        $restaurantCategory[] = Restaurant::categoryNew($id, $condition);
    }

    // Attach dishes: the n-th chunk that contains dishes goes to the n-th category
    $i = 0;
    foreach($dishesdata as $sectionhtml){
        $dish = explode("</li>", $sectionhtml);
        // Only chunks whose first item carries a name span hold dishes
        if(strpos($dish[0], "span class=\"name\"") === false){
            continue;
        }
        foreach($dish as $itemhtml){
            $dishinfo = explode("<span class=\"price_outer\">", $itemhtml);
            $dishname = trim(strip_tags($dishinfo[0]));
            // Compare against "" so that a dish literally named "0" is not dropped
            if($dishname === ""){
                continue;
            }
            $condition = array(
                "restaurant" => $id,
                "category" => isset($restaurantCategory[$i]) ? $restaurantCategory[$i] : null,
                "name" => $dishname,
                "price" => trim(strip_tags(caijirestaurant_part($dishinfo, 1))),
                "state" => "online"
            );
            Restaurant::dishNew($condition);
        }
        $i++;
    }
    return $id;
}

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章