现在的位置: 首页 > 综合 > 正文

用PHP抓取网页

2013年02月15日 ⁄ 综合 ⁄ 共 6228字 ⁄ 字号 评论关闭

用PHP抓取网页

 
作者: 共创联盟
加入时间: 2003-11-24
浏览次数: 322

  抓取网页,并将文字和图片存入数据库中;之后利用 getimg.php?id= 读取数据库中的图片,
利用 getarticle.php?id= 读取文档。

<?

/**建表文档 articletype对应的类型 1:oracle,2:java,3:system
CREATE TABLE article (
  id int(6) NOT NULL auto_increment,
  title varchar(80) default NULL,
  content text,
  url varchar(80) default NULL,
  joindate varchar(12) default NULL,
  articletype int(2) not null,
  PRIMARY KEY  (id)
) ;
CREATE TABLE images (
  id int(4) NOT NULL auto_increment,
  bin_data longblob,
  filetype varchar(50) default NULL,
  title varchar(50) default NULL,
  articleid int(6) NOT NULL,
  PRIMARY KEY  (id)
) ENGINE=MyISAM;
*/

/**
 * Fetches a web page, stores its images and its HTML in MySQL, and rewrites
 * the URLs inside the stored HTML.
 *
 * - Every <img src> is downloaded into the `images` table and its src is
 *   replaced by "$getimg<id>" (by default "getimg.php?id=<id>").
 * - Every relative <a href> is turned into an absolute URL based on $url.
 *
 * Typical use:
 *     $saver = new SaveWeb($title, $url, $typeid);
 *     $articleId = $saver->webSave();
 *
 * Database access uses mysqli prepared statements; one connection is opened
 * and closed per statement. Methods that talk to the database return false
 * (after raising an E_USER_WARNING) when the statement cannot be executed.
 */
class SaveWeb
{
    /** Article title; also stored on each image row so images can be linked to the article later. */
    public $title;
    /** Source URL of the page; base for resolving relative URLs. */
    public $url;
    /** Value written to article.articletype. */
    public $typeid;
    /** HTML supplied through setContent() instead of being downloaded. */
    public $content;
    /** When true, webSave() downloads $url; setContent() switches this off. */
    public $getUrl = true;
    /** Prefix that replaces image URLs; the image id is appended to it. */
    public $getimg = "getimg.php?id=";
    // NOTE(review): credentials are hard-coded here as in the original listing.
    // They are public properties, so callers can (and should) override them
    // from configuration before calling any method that touches the database.
    public $dbuser = "root";
    public $dbpassword = "whf76128";
    public $dbname = "tech";
    public $dbhost = "127.0.0.1";

    /** File extension (lower case, no dot) => MIME type. */
    private static $mimeTypes = array(
        'gif'  => 'image/gif',
        'gz'   => 'application/x-gzip',
        'htm'  => 'text/html',
        'html' => 'text/html',
        'jpg'  => 'image/jpeg',
        'jpeg' => 'image/jpeg',
        'tar'  => 'application/x-tar',
        'txt'  => 'text/plain',
        'zip'  => 'application/zip',
        'png'  => 'image/png',
        'bmp'  => 'image/bmp',
    );

    /**
     * @param string $title  Article title.
     * @param string $url    URL the article comes from.
     * @param int    $typeid Article type id.
     */
    public function __construct($title, $url, $typeid)
    {
        $this->title = $title;
        $this->url = $url;
        $this->typeid = $typeid;
    }

    /**
     * Former PHP 4-style constructor, kept as a plain method so existing
     * explicit calls keep working. PHP 8 no longer treats a method named
     * after its class as a constructor, hence __construct() above.
     */
    public function SaveWeb($title, $url, $typeid)
    {
        $this->__construct($title, $url, $typeid);
    }

    /**
     * Supplies the HTML directly; webSave() will then not download $url.
     *
     * @param string $html
     */
    public function setContent($html)
    {
        $this->content = $html;
        $this->getUrl = false;
    }

    /**
     * Stores $content as an article without any URL rewriting; newlines are
     * converted to <br> first.
     *
     * @return int|string|false New article id, or false on database failure.
     */
    public function saveContent()
    {
        return $this->insertArticle(nl2br((string) $this->content));
    }

    /**
     * Downloads (or takes the supplied) page, stores its images, rewrites its
     * URLs and stores the article.
     *
     * @return int|string|false New article id; false when title or url is
     *                          empty, the page cannot be read, or the insert fails.
     */
    public function webSave()
    {
        if ($this->title == "" || $this->url == "") {
            return false;
        }

        $text = $this->getUrl ? $this->getHtml($this->url) : $this->content;
        if ($text === false) {
            // The download failed; do not store an empty article.
            return false;
        }

        $id = $this->saveHtml($this->parserHtml($text));
        if ($id) {
            $this->updateImgPID($id, $this->title);
        }
        // Remove image rows that were stored but never linked to an article.
        $this->delImg();

        return $id;
    }

    /**
     * Case-insensitive search for $strchild in $strobj starting at byte
     * offset $int.
     *
     * @param string $strobj   Haystack.
     * @param string $strchild Needle.
     * @param int    $int      Start offset; negative values are treated as 0.
     * @return int|false Byte position of the first match, or false when there
     *                   is none. Position 0 is a valid match, so compare the
     *                   result with === false.
     */
    public function strfind($strobj, $strchild, $int)
    {
        $strobj = (string) $strobj;
        $strchild = (string) $strchild;
        $int = max(0, (int) $int);

        // stripos() rejects offsets beyond the end of the haystack.
        if ($strchild === '' || $int > strlen($strobj)) {
            return false;
        }

        return stripos($strobj, $strchild, $int);
    }

    /**
     * Reads the whole resource at $url (needs allow_url_fopen for remote URLs).
     *
     * @param string $url
     * @return string|false Raw bytes, or false when the resource cannot be
     *                      read; an error line is echoed if it cannot be opened.
     */
    public function getHtml($url)
    {
        // Binary mode so image data is never altered by newline translation.
        $fp = fopen($url, "rb");
        if ($fp === false) {
            // The URL may come from a scraped page, so escape it before echoing.
            echo "<font color=red>读取失败,文件位置:" . htmlspecialchars($url, ENT_QUOTES) . "</font><br>";
            return false;
        }

        $data = stream_get_contents($fp);
        fclose($fp);

        return $data;
    }

    /**
     * Deletes every image row that is not linked to an article (articleid = 0).
     * NOTE(review): this also removes unlinked rows written by any other
     * import running at the same time.
     */
    public function delImg()
    {
        $this->runQuery("DELETE FROM images WHERE articleid = 0");
    }

    /**
     * Links the not-yet-linked images stored under $title to article $id.
     * Only rows with articleid = 0 are touched, so images of an older article
     * with the same title are not re-pointed.
     * NOTE(review): matching is by title; if images.title is narrower than
     * article.title, long titles may be truncated on insert and fail to match.
     *
     * @param int    $id    Article id.
     * @param string $title Title the images were stored under.
     */
    public function updateImgPID($id, $title)
    {
        $this->runQuery(
            "UPDATE images SET articleid = ? WHERE title = ? AND articleid = 0",
            'is',
            array((int) $id, (string) $title)
        );
    }

    /**
     * Stores already-processed HTML as an article.
     *
     * @param string $data
     * @return int|string|false New article id, or false on database failure.
     */
    public function saveHtml($data)
    {
        return $this->insertArticle((string) $data);
    }

    /**
     * Downloads $url and stores it as an unlinked image row (articleid = 0).
     *
     * @param string $url Absolute image URL.
     * @return int|string|false New image id; false when the download or the
     *                          insert fails.
     */
    public function saveImg($url)
    {
        $data = $this->getHtml($url);
        if ($data === false) {
            return false;
        }

        return $this->runQuery(
            "INSERT INTO images (bin_data, filetype, title, articleid) VALUES (?, ?, ?, 0)",
            'sss',
            array($data, $this->getContentType($url), (string) $this->title)
        );
    }

    /**
     * @param string $inFileName Path or URL.
     * @return string Last path component.
     */
    public function getContentName($inFileName)
    {
        return basename($inFileName);
    }

    /**
     * Guesses a MIME type from the file extension. The extension is compared
     * case-insensitively and any "?query" or "#fragment" is ignored.
     *
     * @param string $inFileName Path or URL.
     * @return string MIME type; "application/octet-stream" when unknown.
     */
    public function getContentType($inFileName)
    {
        $path = preg_replace('/[?#].*$/s', '', (string) $inFileName);
        $name = basename($path);

        $dot = strrpos($name, '.');
        if ($dot === false) {
            return "application/octet-stream";
        }

        $extension = strtolower(substr($name, $dot + 1));

        return isset(self::$mimeTypes[$extension])
            ? self::$mimeTypes[$extension]
            : "application/octet-stream";
    }

    /**
     * Stores the page's images and makes its links absolute.
     *
     * @param string $text HTML of the page at $this->url.
     * @return string HTML with <img src> pointing at "$getimg<id>" and
     *                relative <a href> values made absolute.
     */
    public function parserHtml($text)
    {
        $base = $this->baseParts();
        $text = $this->rewriteUrls((string) $text, 'img', 'src', true, $base);

        return $this->rewriteUrls($text, 'a', 'href', false, $base);
    }

    /**
     * Rewrites the value of $attribute in every <$tag ...> element.
     * Each value is replaced at its own offset, so a URL that happens to be a
     * substring of another URL (or of body text) is never touched by accident.
     *
     * @param string $text
     * @param string $tag         Tag name, e.g. "img".
     * @param string $attribute   Attribute name, e.g. "src".
     * @param bool   $storeImages true: download the target and point the
     *                            attribute at the stored image; false: only
     *                            make the URL absolute.
     * @param array  $base        Result of baseParts().
     * @return string
     */
    private function rewriteUrls($text, $tag, $attribute, $storeImages, $base)
    {
        // The look-ahead keeps "<a" from matching <abbr>, <area>, ...
        $tagPattern = '/<' . $tag . '(?=[\s\/>])[^>]*>/i';
        // Group 1 is the raw value: double-quoted, single-quoted or bare.
        $attrPattern = '/\s' . $attribute . '\s*=\s*("[^"]*"|\'[^\']*\'|[^\s>"\']+)/i';

        if (!preg_match_all($tagPattern, $text, $tags, PREG_OFFSET_CAPTURE)) {
            return $text;
        }

        $getimgParts = parse_url($this->getimg);
        $getimgPath = (is_array($getimgParts) && isset($getimgParts['path'])) ? $getimgParts['path'] : '';

        $savedIds = array(); // absolute image URL => image id (false if it failed)
        $out = '';
        $cursor = 0;         // offset in $text up to which $out has been built

        foreach ($tags[0] as $tagMatch) {
            list($tagText, $tagOffset) = $tagMatch;
            if (!preg_match($attrPattern, $tagText, $attr, PREG_OFFSET_CAPTURE)) {
                continue; // tag without the attribute
            }

            $raw = $attr[1][0];
            $start = $tagOffset + $attr[1][1];
            if ($raw[0] === '"' || $raw[0] === "'") {
                $value = (string) substr($raw, 1, -1);
                $start++; // skip the opening quote
            } else {
                $value = $raw;
            }

            $absolute = $this->resolveUrl($value, $base);

            if ($storeImages) {
                if (!preg_match('#^(?:https?|ftp)://#i', $absolute)) {
                    continue; // data:, unresolved relative URL, ... - nothing to download
                }
                if ($getimgPath !== '' && stripos($absolute, $getimgPath) !== false) {
                    continue; // already points at the image script
                }
                if (!isset($savedIds[$absolute])) {
                    // Download each distinct image only once per page.
                    $savedIds[$absolute] = $this->saveImg($absolute);
                }
                if (!$savedIds[$absolute]) {
                    continue; // could not be stored; leave the original URL
                }
                $replacement = $this->getimg . $savedIds[$absolute];
            } else {
                $replacement = $absolute;
            }

            if ($replacement === $value) {
                continue;
            }

            $out .= substr($text, $cursor, $start - $cursor) . $replacement;
            $cursor = $start + strlen($value);
        }

        return $out . substr($text, $cursor);
    }

    /**
     * Splits $this->url into the pieces needed to resolve relative URLs.
     *
     * @return array 'scheme', 'root' (scheme://host[:port], '' when $url has
     *               no host), 'path' (path of the page) and 'dir' (path up to
     *               and including its last slash).
     */
    private function baseParts()
    {
        $parts = parse_url((string) $this->url);
        if (!is_array($parts) || empty($parts['host'])) {
            return array('scheme' => 'http', 'root' => '', 'path' => '/', 'dir' => '/');
        }

        $scheme = isset($parts['scheme']) ? strtolower($parts['scheme']) : 'http';
        $root = $scheme . '://' . $parts['host'];
        if (isset($parts['port'])) {
            $root .= ':' . $parts['port'];
        }

        $path = (isset($parts['path']) && $parts['path'] !== '') ? $parts['path'] : '/';
        $slash = strrpos($path, '/');
        $dir = ($slash === false) ? '/' : substr($path, 0, $slash + 1);

        return array('scheme' => $scheme, 'root' => $root, 'path' => $path, 'dir' => $dir);
    }

    /**
     * Makes $url absolute relative to the page.
     *
     * @param string $url
     * @param array  $base Result of baseParts().
     * @return string Absolute URL. Returned unchanged (apart from trimming)
     *                when it is empty, a pure "#fragment", already has a
     *                scheme (http:, https:, mailto:, javascript:, ...), or
     *                when the page URL has no host to resolve against.
     */
    private function resolveUrl($url, $base)
    {
        $url = trim($url);
        if ($url === '' || $url[0] === '#' || $base['root'] === ''
            || preg_match('/^[a-z][a-z0-9+.\-]*:/i', $url)) {
            return $url;
        }
        if (substr($url, 0, 2) === '//') {
            return $base['scheme'] . ':' . $url;       // protocol-relative
        }
        if ($url[0] === '/') {
            return $base['root'] . $url;               // host-relative
        }
        if ($url[0] === '?') {
            return $base['root'] . $base['path'] . $url; // same page, other query
        }

        return $base['root'] . $base['dir'] . $url;    // directory-relative
    }

    /**
     * Inserts one row into `article` for the current title/url/typeid.
     *
     * @param string $content
     * @return int|string|false New article id, or false on database failure.
     */
    private function insertArticle($content)
    {
        return $this->runQuery(
            "INSERT INTO article (title, content, url, joindate, articletype) VALUES (?, ?, ?, ?, ?)",
            'ssssi',
            array(
                (string) $this->title,
                (string) $content,
                (string) $this->url,
                gmdate("Y-m-d"),
                (int) $this->typeid,
            )
        );
    }

    /**
     * Opens a connection, runs one prepared statement and closes the
     * connection again.
     *
     * @param string $sql    Statement with ? placeholders.
     * @param string $types  mysqli bind_param() type string ('' for no parameters).
     * @param array  $params Values for the placeholders, in order.
     * @return int|string|false Insert id reported by the connection (0 for
     *                          statements that insert nothing), or false on
     *                          failure, in which case an E_USER_WARNING is raised.
     */
    private function runQuery($sql, $types = '', $params = array())
    {
        $db = null;
        try {
            $db = new mysqli($this->dbhost, $this->dbuser, $this->dbpassword, $this->dbname);
            if ($db->connect_errno) {
                trigger_error('SaveWeb: database connection failed: ' . $db->connect_error, E_USER_WARNING);
                return false;
            }

            $stmt = $db->prepare($sql);
            if ($stmt === false) {
                $error = $db->error;
                $db->close();
                trigger_error('SaveWeb: could not prepare statement: ' . $error, E_USER_WARNING);
                return false;
            }

            if ($types !== '') {
                // bind_param() takes its values by reference.
                $bound = array($types);
                foreach ($params as $key => $unused) {
                    $bound[] = &$params[$key];
                }
                call_user_func_array(array($stmt, 'bind_param'), $bound);
            }

            $ok = $stmt->execute();
            $error = $stmt->error;
            $insertId = $db->insert_id;
            $stmt->close();
            $db->close();

            if (!$ok) {
                trigger_error('SaveWeb: statement failed: ' . $error, E_USER_WARNING);
                return false;
            }

            return $insertId;
        } catch (mysqli_sql_exception $e) {
            // mysqli throws instead of returning false when exception
            // reporting is enabled (the default since PHP 8.1).
            if ($db instanceof mysqli) {
                $db->close();
            }
            trigger_error('SaveWeb: database error: ' . $e->getMessage(), E_USER_WARNING);
            return false;
        }
    }
}

?>

抱歉!评论已关闭.