小强:thinkphp文章采集模块开发之(三)采集时所用到的方法
博主在此介绍一下文章采集时用到的、我自认为比较重要的几个方法。其中用到了 snoopy.class.php 这个文件,大家自行在网上下载即可。
一、远程图片下载到本地的方法
/**
 * Download a remote image to a local file over a hand-built HTTP/1.1 socket request.
 *
 * @access public
 * @param string $gurl      image URL to download
 * @param string $rfurl     referer URL (also used to obtain a cookie when none is given)
 * @param string $filename  local file path to write the image to
 * @param string $gcookie   cookie header line to send; fetched from $rfurl when empty
 * @param int    $JumpCount 30x redirect hop counter (internal, guards against loops)
 * @param int    $maxtime   maximum number of SECONDS to spend reading the body
 * @return bool  TRUE on success, FALSE on any failure
 */
function DownImageKeep($gurl, $rfurl, $filename, $gcookie="", $JumpCount=0, $maxtime=30)
{
    $urlinfos = GetHostInfo($gurl);
    $ghost = trim($urlinfos['host']);
    if($ghost=='')
    {
        return FALSE;
    }
    $gquery = $urlinfos['query'];
    // When no cookie was supplied, try to reuse the one from the referer page.
    if($gcookie=="" && !empty($rfurl))
    {
        $gcookie = RefurlCookie($rfurl);
    }
    // Build the raw HTTP request by hand.
    $sessionQuery = "GET $gquery HTTP/1.1\r\n";
    $sessionQuery .= "Host: $ghost\r\n";
    $sessionQuery .= "Referer: $rfurl\r\n";
    $sessionQuery .= "Accept: */*\r\n";
    $sessionQuery .= "User-Agent: Mozilla/4.0 (compatible; MSIE 5.00; Windows 98)\r\n";
    // Refuse cookies containing CR/LF to avoid request-header injection.
    if($gcookie!="" && !preg_match("/[\r\n]/", $gcookie))
    {
        $sessionQuery .= $gcookie."\r\n";
    }
    $sessionQuery .= "Connection: Keep-Alive\r\n\r\n";
    $errno = "";
    $errstr = "";
    $m_fp = fsockopen($ghost, 80, $errno, $errstr, 10);
    // BUGFIX: original wrote to the socket without checking the connect succeeded.
    if(!$m_fp)
    {
        return FALSE;
    }
    fwrite($m_fp, $sessionQuery);
    $lnum = 0;
    // Parse the status line, then the response headers into $m_httphead.
    $m_httphead = Array();
    $httpstas = explode(" ", fgets($m_fp, 256));
    $m_httphead["http-edition"] = trim($httpstas[0]);
    $m_httphead["http-state"] = trim($httpstas[1]);
    while(!feof($m_fp))
    {
        $line = trim(fgets($m_fp, 256));
        if($line == "" || $lnum > 100)
        {
            break;
        }
        // NOTE(review): everything from here through the body-read loop was lost
        // to '<'-stripping in the published article; reconstructed from the
        // surviving fragments ("Key: Value" split, 30x handling, timed read).
        $hkey = "";
        $hvalue = "";
        $v = 0;
        for($i=0; $i < strlen($line); $i++)
        {
            if($v == 1)
            {
                $hvalue .= $line[$i];
            }
            else if($line[$i] == ":")
            {
                $v = 1;
            }
            else
            {
                $hkey .= $line[$i];
            }
        }
        $hkey = strtolower(trim($hkey));
        if($hkey != "")
        {
            $m_httphead[$hkey] = trim($hvalue);
        }
        $lnum++;
    }
    // Follow a 30x redirect (at most 3 hops) by recursing on the new location.
    if(preg_match("/^3/", $m_httphead["http-state"]))
    {
        if($JumpCount > 3 || !isset($m_httphead["location"]))
        {
            fclose($m_fp);
            return FALSE;
        }
        $gurl = $m_httphead["location"];
        // A relative Location header must be resolved against the current host.
        if(!preg_match("/^http:\/\//i", $gurl))
        {
            $gurl = "http://".$ghost."/".preg_replace("/^\//", "", $gurl);
        }
        fclose($m_fp);
        return DownImageKeep($gurl, $rfurl, $filename, $gcookie, $JumpCount+1, $maxtime);
    }
    // Only a 2xx answer carries the image body.
    if(!preg_match("/^2/", $m_httphead["http-state"]))
    {
        fclose($m_fp);
        return FALSE;
    }
    $contentLength = isset($m_httphead["content-length"]) ? (int)$m_httphead["content-length"] : -1;
    $fp = fopen($filename, "w");
    if(!$fp)
    {
        fclose($m_fp);
        return FALSE;
    }
    $i = 0;
    $okdata = "";
    $starttime = time();
    while(!feof($m_fp))
    {
        $okdata .= fgetc($m_fp);
        $i++;
        // Give up once the read budget ($maxtime seconds) is exhausted.
        if(time() - $starttime > $maxtime)
        {
            break;
        }
        // Stop once Content-Length bytes have been read.
        // BUGFIX: without the > 0 guard, a missing Content-Length (-1) made
        // $i >= $contentLength true immediately, truncating the download to 1 byte.
        if($contentLength > 0 && $i >= $contentLength)
        {
            break;
        }
    }
    if($okdata != "")
    {
        fwrite($fp, $okdata);
    }
    fclose($fp);
    if($okdata == "")
    {
        @unlink($filename);
        fclose($m_fp);
        return FALSE;
    }
    fclose($m_fp);
    return TRUE;
}
二、分析RSS里的链接
/**
 * Fetch an RSS feed and extract each item's link, title and first image.
 *
 * @access public
 * @param string $rssurl RSS feed URL
 * @return array|string  array of rows ['link','title','image'], or '' when the
 *                       feed yields no links
 */
function GetRssLinks($rssurl)
{
    global $snoopy;
    // Target charset of this site; the feed is converted to it below.
    $cfg_soft_lang = 'utf-8';
    $snoopy->fetch($rssurl);
    $rsshtml = $snoopy->results;
    // Detect the feed's declared encoding from the XML prolog.
    preg_match("/encoding=[\"']([^\"']*)[\"']/is", $rsshtml, $infos);
    if(isset($infos[1]))
    {
        $pcode = strtolower(trim($infos[1]));
    }
    else
    {
        $pcode = strtolower($cfg_soft_lang);
    }
    // Convert the feed to the site's charset when the two differ.
    if($cfg_soft_lang=='gb2312')
    {
        if($pcode=='utf-8')
        {
            $rsshtml = utf82gb($rsshtml);
        }
        else if($pcode=='big5')
        {
            $rsshtml = big52gb($rsshtml);
        }
    }
    else if($cfg_soft_lang=='utf-8')
    {
        if($pcode=='gbk'||$pcode=='gb2312')
        {
            $rsshtml = gb2utf8($rsshtml);
        }
        else if($pcode=='big5')
        {
            $rsshtml = gb2utf8(big52gb($rsshtml));
        }
    }
    $rsarr = array();
    // NOTE(review): the published article lost the opening tags of these patterns
    // to '<'-stripping; reconstructed with two capture groups so that group 2
    // holds the element body, matching the $xxx[2] indexing below.
    preg_match_all("/<title([^>]*)>(.*)<\/title>/isU", $rsshtml, $titles);
    preg_match_all("/<link([^>]*)>(.*)<\/link>/isU", $rsshtml, $links);
    preg_match_all("/<description([^>]*)>(.*)<\/description>/isU", $rsshtml, $descriptions);
    if(!isset($links[2]))
    {
        return '';
    }
    foreach($links[2] as $k=>$v)
    {
        $rsarr[$k]['link'] = RpCdata($v);
        if(isset($titles[2][$k]))
        {
            $rsarr[$k]['title'] = RpCdata($titles[2][$k]);
        }
        else
        {
            // BUGFIX: the original read the unset $titles[2][$k] in this branch;
            // fall back to the last path segment of the link as the title.
            $rsarr[$k]['title'] = preg_replace("/^(.*)\//i", "", RpCdata($v));
        }
        if(isset($descriptions[2][$k]))
        {
            $rsarr[$k]['image'] = GetddImgFromRss($descriptions[2][$k], $rssurl);
        }
        else
        {
            $rsarr[$k]['image'] = '';
        }
    }
    return $rsarr;
}
三、从RSS摘要获取图片信息
/**
 * Extract the first image URL from an RSS item's description HTML.
 *
 * @access public
 * @param string $descriptions item description (HTML fragment)
 * @param string $refurl       feed URL, used to resolve a relative image URL
 * @return string              absolute image URL, or '' when none is found
 */
function GetddImgFromRss($descriptions, $refurl)
{
    if($descriptions=='')
    {
        return '';
    }
    // NOTE(review): the pattern's '<img ... src=' head was lost to '<'-stripping
    // in the published article; reconstructed so group 2 captures the src value
    // (a run of non-whitespace), matching the $imgs[2][0] indexing below.
    preg_match_all("/<img(.*?)src\s*=\s*([^ >\r\n\t]{1,})/is", $descriptions, $imgs);
    if(isset($imgs[2][0]))
    {
        // Strip surrounding quotes and collapse duplicate slashes before resolving.
        $imgs[2][0] = preg_replace("/[\"']/", '', $imgs[2][0]);
        $imgs[2][0] = preg_replace("/\/{1,}/", '/', $imgs[2][0]);
        return FillUrl($refurl, $imgs[2][0]);
    }
    else
    {
        return '';
    }
}
四、补全网址
/**
 * Resolve a possibly-relative URL against the URL of the referring page.
 *
 * @access public
 * @param string $refurl URL of the referring page
 * @param string $surl   URL found on that page (absolute, root-relative, or relative)
 * @return string        absolute http:// URL; the bare base path when $surl is
 *                       empty; '' for a degenerate dot-relative $surl ('.', '..')
 */
function FillUrl($refurl, $surl)
{
    $refurl = trim($refurl);
    $surl = trim($surl);

    // Host part, keeping an explicit port unless it is the default 80.
    $parts = @parse_url($refurl);
    $basehost = (!isset($parts['port']) || $parts['port'] == '80')
        ? $parts['host']
        : $parts['host'].':'.$parts['port'];

    // Rebuild the directory path segment by segment instead of trusting
    // parse_url's path, which mishandles forms like http://h/n/a?q.
    $basepath = $basehost;
    $segments = explode('/', preg_replace("/^http:\/\//i", "", $refurl));
    $last = count($segments) - 1;
    foreach($segments as $idx => $segment)
    {
        if($idx == 0)
        {
            continue; // segment 0 is the host itself
        }
        if($idx < $last)
        {
            // Middle segments join the path unless they carry a query string.
            if(!preg_match("/[\?]/", $segment))
            {
                $basepath .= '/'.$segment;
            }
        }
        else if(!preg_match("/[\?\.]/", $segment))
        {
            // The final segment only counts when it looks like a directory
            // (no query string, no file extension).
            $basepath .= '/'.$segment;
        }
    }

    if($surl == '')
    {
        return $basepath;
    }

    // Drop any fragment part.
    $hash = strpos($surl, "#");
    if($hash > 0)
    {
        $surl = substr($surl, 0, $hash);
    }

    if($surl[0] == '/')
    {
        // Root-relative: anchor at the host.
        $okurl = $basehost.$surl;
    }
    else if($surl[0] == '.')
    {
        if(strlen($surl) <= 2)
        {
            return '';
        }
        $okurl = ($surl[1] == '/')
            ? $basepath.substr($surl, 1)
            : $basepath.'/'.$surl;
    }
    else
    {
        // Already absolute, or relative to the base path. (Strings shorter than
        // 7 chars can never match "http://", so a single test suffices.)
        $okurl = preg_match("/^http:\/\//i", $surl) ? $surl : $basepath.'/'.$surl;
    }

    // Normalise: exactly one scheme prefix, duplicate slashes collapsed.
    $okurl = preg_replace("/^http:\/\//i", '', $okurl);
    return 'http://'.preg_replace("/\/{1,}/", '/', $okurl);
}
五、删除HTML的指定标签属性
/**
 * Remove configured attributes from every HTML tag in a fragment.
 *
 * $this->deny_attr is a comma-separated list of attribute names to strip,
 * e.g. "style,border,on". Each entry is treated as a prefix, so "on" also
 * removes onclick, onload, etc.
 *
 * @param string $c HTML text
 * @return string   HTML text with the denied attributes removed
 */
function del_attr($c)
{
    if(!$this->deny_attr)
    {
        return $c;
    }
    preg_match_all("/<\w(.*?)>/is", $c, $arrayBuffer);
    if(!$arrayBuffer[0])
    {
        return $c;
    }
    $patterns = array();
    foreach(explode(',', $this->deny_attr) as $attr)
    {
        // BUGFIX: the original '=.+?(?=\s|>)' stopped at the first whitespace,
        // leaving the tail of quoted values behind (e.g. style="a: b" left b").
        // Match a full quoted value, or an unquoted run, as one unit; also
        // preg_quote the attribute name in case it contains regex metacharacters.
        $patterns[] = "/\s+".preg_quote($attr, '/')."\w*\s*=\s*(\"[^\"]*\"|'[^']*'|[^\s>]*)/is";
    }
    foreach($arrayBuffer[0] as $tag)
    {
        $clean = preg_replace($patterns, "", $tag);
        $c = str_replace($tag, $clean, $c);
    }
    return $c;
}