Rabu, 23 Agustus 2017

Membuat Web Crawler Sederhana By RieqyNS13

Tags


Kang Karding - Membuat Web Crawler Sederhana By RieqyNS13

Kita bahas langsung satu per satu, gan.

1. Penentu URL (alamat web) tujuan.
Bagian ini akan mengambil satu URL yang belum diproses dari database.

/**
 * Fetch the next unprocessed URL from tbl_url and mark it as taken.
 *
 * Selects the row with the lowest id whose status is '0', sets its status to
 * '1' and returns its url. When no such row exists, or the SELECT fails, the
 * hard-coded seed URL is returned instead.
 *
 * @return string URL to crawl next.
 */
function db_get_url()
{
    $url = '';

    $sql = "SELECT id, url FROM tbl_url WHERE status='0' ORDER BY id LIMIT 1";
    $rs = mysql_query($sql);

    // mysql_query() returns false on failure; handing that to
    // mysql_fetch_array() would only raise a warning, so skip the fetch.
    if ($rs !== false)
    {
        $data = mysql_fetch_array($rs);
        if ($data)
        {
            $url = $data['url'];

            // Cast the id so the value placed into the SQL text is always numeric.
            $sql = "UPDATE tbl_url SET status='1' WHERE id='". (int) $data['id']. "' ";
            mysql_query($sql);
        }
    }

    // Nothing pending (or the query failed): fall back to the seed URL.
    if ($url === '') $url = 'http://planet.terasi.net';

    return $url;
}

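2. Pengunduh halaman.
Fungsi getURL (lihat source code lengkap di bawah) mengunduh isi halaman dari URL tersebut menggunakan cURL.

3. Pemarsing (pemroses) hasil downloadan.
Fungsi parseHTML akan menerima string HTML, kemudian mengekstrak semua link yang ada di string tersebut. Dari setiap link hanya diambil domainnya saja, untuk kemudian disimpan ke dalam database.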

/**
 * Extract the links from an HTML document and queue their domains.
 *
 * Every anchor href value is passed through getDomain(); each non-empty
 * result is handed to db_insert_url().
 *
 * @param string $html Raw HTML of a downloaded page.
 * @return void
 */
function parseHTML($html)
{
    // Accept attributes before href, any whitespace around "=", and both quote
    // styles, so anchors such as <a class="x" href='...'> are not skipped.
    $pattern = '/<a\s(?:[^>]*?\s)?href\s*=\s*(["\'])(.*?)\1/i';
    if (!preg_match_all($pattern, $html, $match)) {
        return;
    }

    // Group 1 is the quote character, group 2 the href value.
    foreach ($match[2] as $href) {
        $domain = getDomain($href);
        if ($domain != '') {
            db_insert_url($domain);
        }
    }
}

Berikut ini adalah source code lengkapnya.

<?php

// Open the MySQL connection. The helper functions below call mysql_query()
// without a link argument, so they rely on this connection being open.
$db = mysql_connect('localhost', 'phpkita', 'phpkita');
if ($db === false)
{
    // Without a connection every query fails and the loop below would spin
    // forever re-fetching the seed URL, so stop right away.
    die('Could not connect to MySQL: ' . mysql_error());
}
if (!mysql_select_db('db_phpkita', $db))
{
    die('Could not select database db_phpkita: ' . mysql_error($db));
}

// Crawl forever: take a URL, download it, store the HTML, queue its links.
while (true)
{
    $url = db_get_url();
    $html = getURL($url);
    db_update_html($url, $html);
    parseHTML($html);
}

// Only reached if the loop above is ever given an exit condition.
mysql_close($db);
exit;

/*
 * fungsi-fungsi
 */
/**
 * Fetch the next unprocessed URL from tbl_url and mark it as taken.
 *
 * Selects the row with the lowest id whose status is '0', sets its status to
 * '1' and returns its url. When no such row exists, or the SELECT fails, the
 * hard-coded seed URL is returned instead.
 *
 * @return string URL to crawl next.
 */
function db_get_url()
{
    $url = '';

    $sql = "SELECT id, url FROM tbl_url WHERE status='0' ORDER BY id LIMIT 1";
    $rs = mysql_query($sql);

    // mysql_query() returns false on failure; handing that to
    // mysql_fetch_array() would only raise a warning, so skip the fetch.
    if ($rs !== false)
    {
        $data = mysql_fetch_array($rs);
        if ($data)
        {
            $url = $data['url'];

            // Cast the id so the value placed into the SQL text is always numeric.
            $sql = "UPDATE tbl_url SET status='1' WHERE id='". (int) $data['id']. "' ";
            mysql_query($sql);
        }
    }

    // Nothing pending (or the query failed): fall back to the seed URL.
    if ($url === '') $url = 'http://planet.terasi.net';

    return $url;
}

/**
 * Download the body of a URL with cURL.
 *
 * Follows up to 5 redirects and leaves the response headers out of the result.
 *
 * @param string $url   Address to fetch; surrounding whitespace is trimmed.
 * @param int    $delay Overall transfer timeout in seconds. Despite its name
 *                      this is not a pause; 0 (the default) or a negative
 *                      value selects the 60-second default.
 * @return string Response body, or '' when $url is blank or the transfer fails.
 */
function getURL($url, $delay=0) {
    $url = trim($url);
    if ($url === "") {
        return "";
    }

    // A non-positive value is not a usable cURL timeout; use the default.
    $delay = intval($delay);
    $timeout = ($delay > 0) ? $delay : 60;

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($ch, CURLOPT_MAXREDIRS, 5);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 60);
    curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);

    $result = curl_exec($ch);
    curl_close($ch);

    // curl_exec() returns false on failure; keep the return type a string so
    // callers can store or parse the result without special-casing.
    if ($result === false) {
        return "";
    }

    return $result;
}

/**
 * Extract the links from an HTML document and queue their domains.
 *
 * Every anchor href value is passed through getDomain(); each non-empty
 * result is handed to db_insert_url().
 *
 * @param string $html Raw HTML of a downloaded page.
 * @return void
 */
function parseHTML($html)
{
    // Accept attributes before href, any whitespace around "=", and both quote
    // styles, so anchors such as <a class="x" href='...'> are not skipped.
    $pattern = '/<a\s(?:[^>]*?\s)?href\s*=\s*(["\'])(.*?)\1/i';
    if (!preg_match_all($pattern, $html, $match)) {
        return;
    }

    // Group 1 is the quote character, group 2 the href value.
    foreach ($match[2] as $href) {
        $domain = getDomain($href);
        if ($domain != '') {
            db_insert_url($domain);
        }
    }
}

/**
 * Queue a URL for crawling by adding a row to tbl_url.
 *
 * The row is stored with an empty html value and status '0'.
 *
 * @param string $url URL to store; escaped with mysql_real_escape_string()
 *                    before it is placed into the statement.
 * @return void
 */
function db_insert_url($url)
{
    $query = sprintf(
        "INSERT INTO tbl_url (url, html, status) VALUES ('%s', '', '0')",
        mysql_real_escape_string($url)
    );

    mysql_query($query);
}

/**
 * Store the downloaded HTML on the tbl_url row(s) matching a URL.
 *
 * Both values are escaped with mysql_real_escape_string() before they are
 * placed into the UPDATE statement.
 *
 * @param string $url  URL whose row should be updated.
 * @param string $html Page content to save in the html column.
 * @return void
 */
function db_update_html($url, $html)
{
    $safeUrl  = mysql_real_escape_string($url);
    $safeHtml = mysql_real_escape_string($html);

    mysql_query(sprintf(
        "UPDATE tbl_url SET html='%s' WHERE url='%s' ",
        $safeHtml,
        $safeUrl
    ));
}

/**
 * Reduce an absolute URL to its scheme and host part.
 *
 * Accepts http:// and https:// URLs (case-insensitive) and keeps everything
 * up to the first character that is not a letter, digit, underscore, dot or
 * hyphen, so any path, query string or port is dropped.
 *
 * @param string $url Link value, e.g. taken from an href attribute.
 * @return string Scheme plus host such as 'http://my-site.example.com', or ''
 *                when $url does not start with http:// or https://.
 */
function getDomain($url)
{
    // The hyphen sits last in the class so it is a literal, not a range
    // operator; otherwise hosts like "my-site" would be cut at the hyphen.
    if (preg_match('/^(https?:\/\/[\w.-]+)/i', $url, $match)) {
        return $match[1];
    }

    return '';
}
?>

Jangan lupa buat juga tabelnya di database:

-- Crawl queue and page store: one row per discovered URL.
CREATE TABLE `tbl_url` (
  `id` int(8) NOT NULL AUTO_INCREMENT,
  -- Unique, so inserting an already known URL again is rejected.
  `url` varchar(128) NOT NULL,
  -- MEDIUMTEXT (up to 16 MB) instead of TEXT, which stops at 65,535 bytes
  -- and would truncate or reject larger downloaded pages.
  `html` mediumtext NOT NULL,
  -- 0 = not yet handed to the crawler, 1 = already picked up.
  `status` int(1) NOT NULL DEFAULT '0',
  PRIMARY KEY (`id`),
  UNIQUE KEY `url` (`url`)
) ENGINE=MyISAM;

Sumber: http://phpkita.wordpress.com


EmoticonEmoticon