小强:thinkphp文章采集模块开发之(三)采集时所用到的方法

作者: admin 分类: php 发布时间: 2012-09-11 09:40

博主在此介绍文章采集时用到的几个我认为比较重要的方法。其中用到了 snoopy.class.php 这个文件,大家自行在网上下载即可。

一、远程图片下载到本地的方法

/**
 * Download a remote image over plain HTTP and save it to a local file.
 *
 * Sends a hand-built HTTP/1.1 GET request to port 80 of the host returned by
 * GetHostInfo($gurl), parses the response headers, follows up to three
 * redirects and writes the response body to $filename.
 *
 * NOTE(review): the published listing was truncated between the header
 * parsing loop and the body reading loop. Header parsing, status/redirect
 * handling and the opening of the output file were reconstructed; confirm
 * them against the original implementation.
 *
 * @access    public
 * @param     string  $gurl       URL of the image to download
 * @param     string  $rfurl      referer URL sent with the request
 * @param     string  $filename   local path the image is written to
 * @param     string  $gcookie    extra request header line appended verbatim
 *                                (skipped if it contains CR/LF); when empty and
 *                                $rfurl is set it is taken from RefurlCookie($rfurl)
 * @param     int     $JumpCount  number of redirects already followed
 * @param     int     $maxtime    maximum number of seconds spent reading the body
 * @return    bool    TRUE when at least one byte was saved, FALSE otherwise
 */
function DownImageKeep($gurl, $rfurl, $filename, $gcookie="", $JumpCount=0, $maxtime=30)
{
    $urlinfos = GetHostInfo($gurl);
    $ghost = trim($urlinfos['host']);
    if($ghost=='')
    {
        return FALSE;
    }
    $gquery = $urlinfos['query'];
    if($gcookie=="" && !empty($rfurl))
    {
        $gcookie = RefurlCookie($rfurl);
    }

    // Build the raw request.
    $sessionQuery = "GET $gquery HTTP/1.1\r\n";
    $sessionQuery .= "Host: $ghost\r\n";
    $sessionQuery .= "Referer: $rfurl\r\n";
    $sessionQuery .= "Accept: */*\r\n";
    $sessionQuery .= "User-Agent: Mozilla/4.0 (compatible; MSIE 5.00; Windows 98)\r\n";
    // Reject values containing line breaks so no additional headers can be injected.
    if($gcookie!="" && !preg_match("/[\r\n]/", $gcookie))
    {
        $sessionQuery .= $gcookie."\r\n";
    }
    $sessionQuery .= "Connection: Keep-Alive\r\n\r\n";

    $errno = "";
    $errstr = "";
    $m_fp = fsockopen($ghost, 80, $errno, $errstr, 10);
    if(!$m_fp)
    {
        // Connection failed; there is nothing to read.
        return FALSE;
    }
    fwrite($m_fp, $sessionQuery);

    // Status line: "HTTP/1.x <code> <reason>".
    $m_httphead = array();
    $httpstas = explode(" ", (string)fgets($m_fp, 256));
    $m_httphead["http-edition"] = trim($httpstas[0]);
    $m_httphead["http-state"] = isset($httpstas[1]) ? trim($httpstas[1]) : '';

    // Response headers, stored under lower-cased names; stop at the blank
    // line that ends the header section or after 100 header lines.
    $lnum = 0;
    while(!feof($m_fp))
    {
        $line = trim((string)fgets($m_fp, 256));
        if($line == "" || $lnum > 100)
        {
            break;
        }
        $lnum++;
        $parts = explode(':', $line, 2);
        $hkey = strtolower(trim($parts[0]));
        if($hkey != "")
        {
            $m_httphead[$hkey] = isset($parts[1]) ? trim($parts[1]) : '';
        }
    }

    // Redirect: retry against the Location target, at most three times.
    if(preg_match("/^3/", $m_httphead["http-state"]))
    {
        fclose($m_fp);
        if(!empty($m_httphead["location"]) && $JumpCount < 3)
        {
            $location = $m_httphead["location"];
            if(!preg_match("/^https?:\/\//i", $location))
            {
                // Non-absolute Location: resolve it against the current host root.
                $location = 'http://'.$ghost.'/'.ltrim($location, '/');
            }
            return DownImageKeep($location, $rfurl, $filename, $gcookie, $JumpCount + 1, $maxtime);
        }
        return FALSE;
    }

    // Anything other than 2xx is treated as a failure.
    if(!preg_match("/^2/", $m_httphead["http-state"]))
    {
        fclose($m_fp);
        return FALSE;
    }

    // -1 means "length unknown": read until EOF or until the time limit.
    $contentLength = isset($m_httphead['content-length']) ? (int)$m_httphead['content-length'] : -1;

    $fp = fopen($filename, "w");
    if(!$fp)
    {
        fclose($m_fp);
        return FALSE;
    }

    $i = 0;
    $okdata = "";
    $starttime = time();
    while(($contentLength < 0 || $i < $contentLength) && !feof($m_fp))
    {
        $byte = fgetc($m_fp);
        if($byte === FALSE)
        {
            break;
        }
        $okdata .= $byte;
        $i++;

        // Stop once the time budget is used up.
        if(time() - $starttime > $maxtime)
        {
            break;
        }
    }
    fclose($m_fp);

    if($okdata != "")
    {
        fwrite($fp, $okdata);
    }
    fclose($fp);

    if($okdata == "")
    {
        // Nothing was received: do not leave an empty file behind.
        @unlink($filename);
        return FALSE;
    }
    return TRUE;
}

二、分析rss里的链接

/**
 * Fetch an RSS feed and extract link, title and image for every item.
 *
 * The feed is fetched through the global $snoopy object. The declared XML
 * encoding is read from the document; gbk/gb2312/big5 feeds are converted to
 * utf-8 with gb2utf8()/big52gb(), anything else is used as-is.
 *
 * @access    public
 * @param     string  $rssurl  URL of the RSS feed
 * @return    array|string  list of arrays with the keys 'link', 'title' and
 *                          'image' (empty list when no item matched), or ''
 *                          when the link match produced no result set
 */
function GetRssLinks($rssurl)
{
    global $snoopy;
    $snoopy->fetch($rssurl);
    $rsshtml = $snoopy->results;

    // Detect the declared encoding; assume utf-8 when none is declared.
    $pcode = 'utf-8';
    if(preg_match("/encoding=[\"']([^\"']*)[\"']/is", $rsshtml, $infos))
    {
        $pcode = strtolower(trim($infos[1]));
    }

    // Normalise the feed to utf-8.
    if($pcode=='gbk' || $pcode=='gb2312')
    {
        $rsshtml = gb2utf8($rsshtml);
    }
    else if($pcode=='big5')
    {
        $rsshtml = gb2utf8(big52gb($rsshtml));
    }

    // Each pattern has two groups: [1] is whatever sits between "<item" and
    // the element, [2] is the element content. Anchoring on "<item" skips the
    // channel-level <title>/<link>/<description>.
    preg_match_all("/<item(.*)<title>(.*)<\/title>/isU", $rsshtml, $titles);
    preg_match_all("/<item(.*)<link>(.*)<\/link>/isU", $rsshtml, $links);
    preg_match_all("/<item(.*)<description>(.*)<\/description>/isU", $rsshtml, $descriptions);
    if(!isset($links[2]))
    {
        return '';
    }

    $rsarr = array();
    foreach($links[2] as $k=>$v)
    {
        $link = RpCdata($v);
        $rsarr[$k]['link'] = $link;

        if(isset($titles[2][$k]))
        {
            $rsarr[$k]['title'] = RpCdata($titles[2][$k]);
        }
        else
        {
            // No title for this item: fall back to the last path segment of its link.
            $rsarr[$k]['title'] = preg_replace("/^(.*)\//i", "", $link);
        }

        if(isset($descriptions[2][$k]))
        {
            $rsarr[$k]['image'] = GetddImgFromRss($descriptions[2][$k], $rssurl);
        }
        else
        {
            $rsarr[$k]['image'] = '';
        }
    }
    return $rsarr;
}

三、从RSS摘要获取图片信息

/**
 * Extract the first image URL from an RSS item description.
 *
 * Looks for the first <img ... src=...> in the description, strips the
 * quotes around the src value, collapses repeated slashes (leaving the
 * "//" of a "scheme://" prefix intact) and resolves the result against
 * $refurl with FillUrl().
 *
 * @access    public
 * @param     string  $descriptions  description markup of one RSS item
 * @param     string  $refurl        URL used as the base for relative image paths
 * @return    string  resolved image URL, or '' when no image is found
 */
function GetddImgFromRss($descriptions,$refurl)
{
    if($descriptions=='')
    {
        return '';
    }

    // Group [2] is the src value (possibly still carrying its opening quote).
    preg_match_all("/<img(.*)src=[\"']{0,1}(.*)[\"']{0,1}[> \r\n\t]{1,}/isU", $descriptions, $imgs);
    if(!isset($imgs[2][0]))
    {
        return '';
    }

    $src = preg_replace("/[\"']/", '', $imgs[2][0]);
    // Collapse duplicate slashes, but never the pair right after "scheme:".
    $src = preg_replace('#(?<!:)/{2,}#', '/', $src);
    return FillUrl($refurl, $src);
}

四、补全网址

/**
 * Resolve a (possibly relative) link against a base URL.
 *
 * Supports http and https. The base directory is derived from $refurl by
 * dropping a last path segment that contains "." or "?" (treated as a file);
 * a last segment without either is treated as a directory. A non-default
 * port of the base URL is kept.
 *
 * @access    public
 * @param     string  $refurl  base URL the link was found on
 * @param     string  $surl    link to resolve: absolute, root-relative ("/x"),
 *                             "./x" or plain relative
 * @return    string  absolute URL; when $surl is empty the base path WITHOUT
 *                    scheme is returned; '' when $surl is "." followed by at
 *                    most one character, or when $refurl has no host and
 *                    $surl is not absolute
 */
function FillUrl($refurl,$surl)
{
    $refurl = trim($refurl);
    $surl = trim($surl);

    $urls = @parse_url($refurl);
    $hasBase = is_array($urls) && isset($urls['host']) && $urls['host'] !== '';
    $scheme = ($hasBase && isset($urls['scheme']) && strtolower($urls['scheme'])=='https') ? 'https' : 'http';

    $basehost = '';
    $basepath = '';
    if($hasBase)
    {
        // Omit the port when it is the default one for the scheme.
        $defaultPort = ($scheme=='https') ? '443' : '80';
        $basehost = $urls['host'];
        if(isset($urls['port']) && (string)$urls['port'] !== $defaultPort)
        {
            $basehost .= ':'.$urls['port'];
        }

        // The path is rebuilt from the raw URL instead of parse_url()'s
        // 'path' so that URLs like http://host/dir/page?query are handled.
        $basepath = $basehost;
        $paths = explode('/', preg_replace("/^https?:\/\//i", "", $refurl));
        $n = count($paths);
        for($i=1; $i < ($n-1); $i++)
        {
            if(strpos($paths[$i], '?') === false)
            {
                $basepath .= '/'.$paths[$i];
            }
        }
        // $paths[0] is the host itself, so only look at the last segment
        // when there really is a path.
        if($n > 1 && !preg_match("/[\?\.]/", $paths[$n-1]))
        {
            $basepath .= '/'.$paths[$n-1];
        }
    }

    if($surl=='')
    {
        return $basepath;
    }

    // Drop a trailing "#fragment".
    $pos = strpos($surl, "#");
    if($pos !== false && $pos > 0)
    {
        $surl = substr($surl, 0, $pos);
    }

    if(preg_match("/^(https?):\/\//i", $surl, $m))
    {
        // Already absolute: keep its own scheme.
        $scheme = strtolower($m[1]);
        $okurl = $surl;
    }
    else if(!$hasBase)
    {
        // A relative link cannot be resolved without a usable base.
        return '';
    }
    else if($surl[0]=='/')
    {
        // Root-relative link.
        $okurl = $basehost.$surl;
    }
    else if($surl[0]=='.')
    {
        if(strlen($surl)<=2)
        {
            return '';
        }
        else if($surl[1]=='/')
        {
            // "./x" -> "<basepath>/x"
            $okurl = $basepath.substr($surl, 1);
        }
        else
        {
            $okurl = $basepath.'/'.$surl;
        }
    }
    else
    {
        $okurl = $basepath.'/'.$surl;
    }

    // Collapse duplicate slashes in everything after the scheme.
    $okurl = preg_replace("/^https?:\/\//i", '', $okurl);
    return $scheme.'://'.preg_replace("/\/{1,}/", '/', $okurl);
}

五、删除HTML的指定标签属性

/**
 * Remove the configured attributes from every HTML tag in a string.
 *
 * Reads $this->deny_attr, a comma-separated list of attribute name prefixes
 * (e.g. "style,border,on" - "on" removes every attribute whose name starts
 * with "on"). Matching is case-insensitive. Because it uses $this, this
 * function has to be declared as a method of a class that provides the
 * deny_attr property.
 *
 * @param     string  $c  HTML text
 * @return    string  the HTML with the denied attributes removed; $c unchanged
 *                    when deny_attr is empty or $c contains no tags
 */
function del_attr($c)
	{
		if (!$this->deny_attr){
			return $c;
		}

		// One pattern per denied prefix. Blank entries are skipped: an empty
		// prefix would match, and therefore strip, every attribute.
		$patterns = array();
		foreach (explode(',', $this->deny_attr) as $name){
			$name = trim($name);
			if ($name === ''){
				continue;
			}
			// The value is a double-quoted, single-quoted or bare token, so
			// quoted values containing spaces are removed as a whole.
			$patterns[] = '/\s+'.preg_quote($name, '/').'\w*\s*=\s*(?:"[^"]*"|\'[^\']*\'|[^\s>]+)/is';
		}
		if (!$patterns){
			return $c;
		}

		preg_match_all("/<\w(.*?)>/is", $c, $arrayBuffer);
		foreach ($arrayBuffer[0] as $tag){
			$cleaned = preg_replace($patterns, "", $tag);
			if ($cleaned !== $tag){
				$c = str_replace($tag, $cleaned, $c);
			}
		}
		return $c;
	}

来源:http://www.baidu3k.com/archives/637