Estou com o seguinte problema: estou montando um motor de busca, mas quando a codificação do site a ser escaneado é diferente, ele retorna erros.
Arrumei uma função conversora para UTF-8, só que ela parece não estar funcionando.
/**
 * Return $texto as valid UTF-8.
 *
 * The input is scanned byte by byte:
 *  - ASCII bytes are kept, except control characters other than TAB, LF and CR.
 *  - A lead byte followed by a complete, well-formed UTF-8 sequence (2 to 4
 *    bytes) is copied unchanged.
 *  - Any other byte >= 0x80 is treated as ISO-8859-1: bytes 0xA0-0xFF are
 *    re-encoded as two-byte UTF-8, bytes 0x80-0x9F (C1 controls) are dropped.
 *
 * Well-formedness is checked with PCRE's UTF-8 validator, so overlong forms,
 * surrogates and code points above U+10FFFF are not passed through as-is.
 * The Latin-1 conversion is done by hand because utf8_encode() is deprecated
 * as of PHP 8.2.
 *
 * @param string $texto Raw bytes in UTF-8, ISO-8859-1 or a mix of both.
 * @return string Sanitized UTF-8 text.
 */
function sanitizar_utf8($texto) {
    $saida = '';
    $i = 0;
    $len = strlen($texto);
    while ($i < $len) {
        $char = $texto[$i++];
        $ord = ord($char);

        // 0xxxxxxx: single-byte ASCII symbol.
        if ($ord < 0x80) {
            $controle = ($ord <= 31 || $ord == 127);
            // Keep printable ASCII plus tab, line feed and carriage return.
            if (!$controle || $ord == 9 || $ord == 10 || $ord == 13) {
                $saida .= $char;
            }
            continue;
        }

        // Expected sequence length according to the lead byte; 0 means the
        // byte cannot start a UTF-8 sequence (10xxxxxx or 11111xxx).
        if (($ord & 0xE0) == 0xC0) {
            $bytes = 2; // 110xxxxx 10xxxxxx
        } elseif (($ord & 0xF0) == 0xE0) {
            $bytes = 3; // 1110xxxx 10xxxxxx 10xxxxxx
        } elseif (($ord & 0xF8) == 0xF0) {
            $bytes = 4; // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        } else {
            $bytes = 0;
        }

        if ($bytes > 0) {
            $sequencia = substr($texto, $i - 1, $bytes);
            // The "u" modifier makes preg_match() fail on malformed UTF-8.
            if (strlen($sequencia) == $bytes && preg_match('//u', $sequencia) === 1) {
                $saida .= $sequencia;
                $i += $bytes - 1;
                continue;
            }
        }

        // Fallback: interpret the single byte as ISO-8859-1. Only the lead
        // byte is consumed, so the following bytes are examined again.
        if ($ord > 0x9F) {
            $saida .= chr(0xC0 | ($ord >> 6)) . chr(0x80 | ($ord & 0x3F));
        }
    }
    return $saida;
}
?>
Tento implementá-la no motor de busca, mas me parece que ela simplesmente não faz efeito nenhum.
ratrear.php
<?php
// Crawler entry point. Loads configuration and helpers, reads the crawl
// parameters either from the request (via getHttpVars()) or from the command
// line, opens the optional log file and then indexes one site or all sites.
set_time_limit (0);
$include_dir = "../include";
include "auth.php";
require_once ("$include_dir/commonfuncs.php");
$all = 0;
// NOTE(review): extract() turns every entry returned by getHttpVars() into a
// global variable; presumably these are request parameters - confirm that
// nothing security-sensitive can be overwritten this way.
extract (getHttpVars());
$settings_dir = "../settings";
require_once ("$settings_dir/conf.php");
include "messages.php";
include "rastrearfuncs.php";
error_reporting (E_ALL ^ E_NOTICE ^ E_WARNING);
$delay_time = 0;
$command_line = 0;
if (isset($_SERVER['argv']) && $_SERVER['argc'] >= 2) {
    $command_line = 1;
    $ac = 1; //argument counter
    while ($ac < (count($_SERVER['argv']))) {
        $arg = $_SERVER['argv'][$ac];
        // Options that consume the next argument must actually have one.
        if (in_array($arg, array('-u', '-d', '-m', '-n')) && !isset($_SERVER['argv'][$ac+1])) {
            commandline_help();
            die();
        }
        if ($arg == '-all') {
            $all = 1;
            break;
        } else if ($arg == '-u') {
            $url = $_SERVER['argv'][$ac+1];
            $ac = $ac+2;
        } else if ($arg == '-f') {
            $soption = 'full';
            $ac++;
        } else if ($arg == '-d') {
            $soption = 'level';
            $maxlevel = $_SERVER['argv'][$ac+1];
            $ac = $ac+2;
        } else if ($arg == '-l') {
            $domaincb = 1;
            $ac++;
        } else if ($arg == '-r') {
            $reindex = 1;
            $ac++;
        } else if ($arg == '-m') {
            // A literal "\n" on the command line separates multiple strings.
            $in = str_replace("\\n", chr(10), $_SERVER['argv'][$ac+1]);
            $ac = $ac+2;
        } else if ($arg == '-n') {
            $out = str_replace("\\n", chr(10), $_SERVER['argv'][$ac+1]);
            $ac = $ac+2;
        } else {
            commandline_help();
            die();
        }
    }
}
if (isset($soption) && $soption == 'full') {
    $maxlevel = -1;
}
// Without -f/-d no mode would be set and index_site()'s crawl loop would
// never run; default to level mode (with the default depth 0 below this
// indexes only the start page).
if (!isset($soption)) {
    $soption = 'level';
}
if (!isset($domaincb)) {
    $domaincb = 0;
}
if (!isset($reindex)) {
    $reindex = 0;
}
if (!isset($maxlevel)) {
    $maxlevel = 0;
}
// Always define the handle so later "if ($log_handle)" checks are well-defined.
$log_handle = null;
if ($keep_log) {
    if ($log_format=="html") {
        $log_file = $log_dir."/".Date("ymdHi").".html";
    } else {
        $log_file = $log_dir."/".Date("ymdHi").".log";
    }
    if (!$log_handle = fopen($log_file, 'w')) {
        die ("Logging option is set, but cannot open file for logging.");
    }
}
if ($all == 1) {
    index_all();
} else {
    if ($reindex == 1 && $command_line == 1) {
        // Command-line reindex: reuse the crawl settings stored for this site.
        // The URL comes from argv, so it is escaped before going into SQL.
        $result = mysql_query("select url, spider_depth, required, disallowed, can_leave_domain from ".$mysql_table_prefix."sites where url='".mysql_real_escape_string($url)."'");
        echo mysql_error();
        if ($row = mysql_fetch_row($result)) {
            $url = $row[0];
            $maxlevel = $row[1];
            $in = $row[2];
            $out = $row[3];
            $domaincb = $row[4];
            if ($domaincb=='') {
                $domaincb = 0;
            }
            if ($maxlevel == -1) {
                $soption = 'full';
            } else {
                $soption = 'level';
            }
        }
    }
    if (!isset($in)) {
        $in = "";
    }
    if (!isset($out)) {
        $out = "";
    }
    index_site($url, $reindex, $maxlevel, $soption, $in, $out, $domaincb);
}
// NOTE(review): this assignment runs after the crawl above has finished, so
// index_url() does not see an initialised $tmp_urls unless index_site() set it.
$tmp_urls = Array();
/**
 * Current Unix timestamp with microsecond precision.
 *
 * microtime() without arguments yields "msec sec"; both parts are cast to
 * float and summed.
 *
 * @return float Seconds since the Unix epoch, including the fractional part.
 */
function microtime_float(){
    $parts = explode(" ", microtime());
    $fraction = (float)$parts[0];
    $seconds = (float)$parts[1];
    return ($fraction + $seconds);
}
/**
 * Fetch one URL, queue the links found in it and (re)index its words.
 *
 * Steps, in order: check the URL with url_status(); queue the redirect target
 * for "Relocation" answers; download the page; compare its md5 with the stored
 * one; run clean_file()/get_links(); insert new links into the "temp" table;
 * then insert or update the row in "links" and store keywords through
 * save_keywords(). Progress is reported through the print*Report() helpers.
 *
 * @param string $url              Page to index.
 * @param int    $level            Level written to the temp table for links found here; the page itself is stored with $level - 1.
 * @param mixed  $site_id          Id of the owning site, stored in the links table.
 * @param string $md5sum           md5 stored for this page; '' means the page is not in the links table yet.
 * @param string $domain           Not read by the active code (only by a commented-out section).
 * @param string $indexdate        Not read by the active code (only by a commented-out section).
 * @param string $sessid           Crawl session id used as "id" in the temp table.
 * @param mixed  $can_leave_domain Passed through to url_purify()/get_links().
 * @param int    $reindex          1 when re-indexing; enables removal of pages that became unavailable or noindex.
 * @return void
 */
function index_url($url, $level, $site_id, $md5sum, $domain, $indexdate, $sessid, $can_leave_domain, $reindex) {
global $entities, $min_delay;
global $command_line;
global $min_words_per_page;
global $supdomain;
global $mysql_table_prefix, $user_agent, $tmp_urls, $delay_time, $domain_arr;
$needsReindex = 1;
$deletable = 0;
$url_status = url_status($url);
$thislevel = $level - 1;
// Redirects: resolve the target and queue it in the temp table for this
// session unless it is already there.
if (strstr($url_status['state'], "Relocation")) {
$url = preg_replace("/ /", "", url_purify($url_status['path'], $url, $can_leave_domain));
if ($url <> '') {
// NOTE(review): $url is interpolated into SQL without escaping here and in
// the queries below.
$result = mysql_query("select link from ".$mysql_table_prefix."temp where link='$url' && id = '$sessid'");
echo mysql_error();
$rows = mysql_numrows($result);
if ($rows == 0) {
mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$url', '$level', '$sessid')");
echo mysql_error();
}
}
// NOTE(review): this is a comparison, not an assignment, so it has no effect;
// the state stays "Relocation: ..." and the page falls into the final else
// branch below.
$url_status['state'] == "redirected";
}
/*
if ($indexdate <> '' && $url_status['date'] <> '') {
if ($indexdate > $url_status['date']) {
$url_status['state'] = "Date checked. Page contents not changed";
$needsReindex = 0;
}
}*/
ini_set("user_agent", $user_agent);
if ($url_status['state'] == 'ok') {
$OKtoIndex = 1;
$file_read_error = 0;
// Throttle: wait until at least $min_delay seconds have passed since the
// previous download.
if (time() - $delay_time < $min_delay) {
sleep ($min_delay- (time() - $delay_time));
}
$delay_time = time();
// fst_lt_snd() is defined elsewhere; presumably a "first < second" version
// comparison selecting the fopen() fallback for PHP older than 4.3.0.
if (!fst_lt_snd(phpversion(), "4.3.0")) {
$file = file_get_contents($url);
if ($file === FALSE) {
$file_read_error = 1;
}
} else {
$fl = @fopen($url, "r");
if ($fl) {
while ($buffer = @fgets($fl, 4096)) {
$file .= $buffer;
}
} else {
$file_read_error = 1;
}
// NOTE(review): fclose() is also reached when fopen() failed and $fl is false.
fclose ($fl);
}
// Fall back to the raw-socket downloader when the stream wrapper failed.
if ($file_read_error) {
$contents = getFileContents($url);
$file = $contents['file'];
}
// Page size in KiB with two decimals.
$pageSize = number_format(strlen($file)/1024, 2, ".", "");
printPageSizeReport($pageSize);
if ($url_status['content'] != 'text') {
$file = extract_text($file, $url_status['content']);
}
printStandardReport('starting', $command_line);
// Skip pages whose content hash is unchanged or already known for another URL.
$newmd5sum = md5($file);
if ($md5sum == $newmd5sum) {
printStandardReport('md5notChanged',$command_line);
$OKtoIndex = 0;
} else if (isDuplicateMD5($newmd5sum)) {
$OKtoIndex = 0;
printStandardReport('duplicate',$command_line);
}
if (($md5sum != $newmd5sum || $reindex ==1) && $OKtoIndex == 1) {
$urlparts = parse_url($url);
$newdomain = $urlparts['host'];
$type = 0;
/* if ($newdomain <> $domain)
$domainChanged = 1;
if ($domaincb==1) {
$start = strlen($newdomain) - strlen($supdomain);
if (substr($newdomain, $start) == $supdomain) {
$domainChanged = 0;
}
}*/
// remove link to css file
//get all links from file
$data = clean_file($file, $url, $url_status['content']);
// A robots "noindex" meta tag makes the page non-indexable and removable.
if ($data['noindex'] == 1) {
$OKtoIndex = 0;
$deletable = 1;
printStandardReport('metaNoindex',$command_line);
}
$wordarray = unique_array(explode(" ", $data['content']));
if ($data['nofollow'] != 1) {
$links = get_links($file, $url, $can_leave_domain, $data['base']);
$links = distinct_array($links);
$all_links = count($links);
$numoflinks = 0;
//if there are any, add to the temp table, but only if there isnt such url already
if (is_array($links)) {
reset ($links);
// NOTE(review): each() was removed in PHP 8.
while ($thislink = each($links)) {
// $tmp_urls remembers the links already queued during this crawl session.
if ($tmp_urls[$thislink[1]] != 1) {
$tmp_urls[$thislink[1]] = 1;
$numoflinks++;
mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$thislink[1]', '$level', '$sessid')");
echo mysql_error();
}
}
}
} else {
printStandardReport('noFollow',$command_line);
}
if ($OKtoIndex == 1) {
$title = $data['title'];
$host = $data['host'];
$path = $data['path'];
$fulltxt = $data['fulltext'];
$desc = substr($data['description'], 0,254);
$url_parts = parse_url($url);
$domain_for_db = $url_parts['host'];
// Look the domain id up in the in-memory cache, inserting the domain on first sight.
if (isset($domain_arr[$domain_for_db])) {
$dom_id = $domain_arr[$domain_for_db];
} else {
mysql_query("insert into ".$mysql_table_prefix."domains (domain) values ('$domain_for_db')");
$dom_id = mysql_insert_id();
$domain_arr[$domain_for_db] = $dom_id;
}
$wordarray = calc_weights ($wordarray, $title, $host, $path, $data['keywords']);
//if there are words to index, add the link to the database, get its id, and add the word + their relation
if (is_array($wordarray) && count($wordarray) > $min_words_per_page) {
// Empty stored md5: the page is new, so insert it.
if ($md5sum == '') {
mysql_query ("insert into ".$mysql_table_prefix."links (site_id, url, title, description, fulltxt, indexdate, size, md5sum, level) values ('$site_id', '$url', '$title', '$desc', '$fulltxt', curdate(), '$pageSize', '$newmd5sum', $thislevel)");
echo mysql_error();
$result = mysql_query("select link_id from ".$mysql_table_prefix."links where url='$url'");
echo mysql_error();
$row = mysql_fetch_row($result);
$link_id = $row[0];
save_keywords($wordarray, $link_id, $dom_id);
printStandardReport('indexed', $command_line);
}else if (($md5sum <> '') && ($md5sum <> $newmd5sum)) { //if page has changed, start updating
$result = mysql_query("select link_id from ".$mysql_table_prefix."links where url='$url'");
echo mysql_error();
$row = mysql_fetch_row($result);
$link_id = $row[0];
// Keyword relations are spread over 16 tables, link_keyword0 .. link_keywordf;
// clear the old relations of this page in each of them.
for ($i=0;$i<=15; $i++) {
$char = dechex($i);
mysql_query ("delete from ".$mysql_table_prefix."link_keyword$char where link_id=$link_id");
echo mysql_error();
}
save_keywords($wordarray, $link_id, $dom_id);
$query = "update ".$mysql_table_prefix."links set title='$title', description ='$desc', fulltxt = '$fulltxt', indexdate=now(), size = '$pageSize', md5sum='$newmd5sum', level=$thislevel where link_id=$link_id";
mysql_query($query);
echo mysql_error();
printStandardReport('re-indexed', $command_line);
}
}else {
printStandardReport('minWords', $command_line);
}
}
}
} else {
// Any state other than 'ok' (unreachable, relocation, non-text, ...).
$deletable = 1;
printUrlStatus($url_status['state'], $command_line);
}
if ($reindex ==1 && $deletable == 1) {
check_for_removal($url);
} else if ($reindex == 1) {
}
// The counters only exist when the link-extraction branch above ran.
if (!isset($all_links)) {
$all_links = 0;
}
if (!isset($numoflinks)) {
$numoflinks = 0;
}
printLinksReport($numoflinks, $all_links, $command_line);
}
/**
 * Crawl one site level by level and index every page that is reached.
 *
 * Registers or updates the site row, seeds the "temp" link queue with the
 * start URL (plus all known pages when re-indexing), resumes a suspended crawl
 * recorded in "pending" if one exists, then walks the queue one level at a
 * time. For each link it applies the robots.txt exclusions and the
 * include/exclude strings and calls index_url(). The temp and pending rows of
 * the session are deleted when the crawl completes.
 *
 * @param string $url              Start URL of the site; "/" is appended when it has no path.
 * @param int    $reindex          1 to re-index pages that are already in the database.
 * @param int    $maxlevel         Maximum link depth in 'level' mode.
 * @param string $soption          'level' (bounded by $maxlevel) or 'full' (until no links remain).
 * @param string $url_inc          Stored as "required" and passed to check_include().
 * @param string $url_not_inc      Stored as "disallowed" and passed to check_include().
 * @param mixed  $can_leave_domain Stored with the site and passed to index_url().
 * @return void
 */
function index_site($url, $reindex, $maxlevel, $soption, $url_inc, $url_not_inc, $can_leave_domain) {
global $mysql_table_prefix, $command_line, $mainurl, $tmp_urls, $domain_arr, $all_keywords;
// Load the keyword => id cache once per process.
if (!isset($all_keywords)) {
$result = mysql_query("select keyword_ID, keyword from ".$mysql_table_prefix."keywords");
echo mysql_error();
while($row=mysql_fetch_array($result)) {
$all_keywords[addslashes($row[1])] = $row[0];
}
}
$compurl = parse_url($url);
if ($compurl['path'] == '')
$url = $url . "/";
// Session id for this crawl: md5 of the current microtime and REMOTE_ADDR.
$t = microtime();
$a = getenv("REMOTE_ADDR");
$sessid = md5 ($t.$a);
$urlparts = parse_url($url);
$domain = $urlparts['host'];
if (isset($urlparts['port'])) {
$port = (int)$urlparts['port'];
}else {
$port = 80;
}
// NOTE(review): $url, $url_inc and $url_not_inc are interpolated into SQL
// without escaping throughout this function.
$result = mysql_query("select site_id from ".$mysql_table_prefix."sites where url='$url'");
echo mysql_error();
$row = mysql_fetch_row($result);
$site_id = $row[0];
if ($site_id != "" && $reindex == 1) {
// Re-index of a known site: queue the start URL and every stored page.
mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$url', 0, '$sessid')");
echo mysql_error();
$result = mysql_query("select url, level from ".$mysql_table_prefix."links where site_id = $site_id");
while ($row = mysql_fetch_array($result)) {
$site_link = $row['url'];
$link_level = $row['level'];
if ($site_link != $url) {
mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$site_link', $link_level, '$sessid')");
}
}
$qry = "update ".$mysql_table_prefix."sites set indexdate=now(), spider_depth = $maxlevel, required = '$url_inc'," .
"disallowed = '$url_not_inc', can_leave_domain=$can_leave_domain where site_id=$site_id";
mysql_query ($qry);
echo mysql_error();
} else if ($site_id == '') {
// Unknown site: create its row and read back the generated id.
mysql_query ("insert into ".$mysql_table_prefix."sites (url, indexdate, spider_depth, required, disallowed, can_leave_domain) " .
"values ('$url', now(), $maxlevel, '$url_inc', '$url_not_inc', $can_leave_domain)");
echo mysql_error();
$result = mysql_query("select site_ID from ".$mysql_table_prefix."sites where url='$url'");
$row = mysql_fetch_row($result);
$site_id = $row[0];
} else {
// Known site, normal index run: only refresh the stored crawl settings.
mysql_query ("update ".$mysql_table_prefix."sites set indexdate=now(), spider_depth = $maxlevel, required = '$url_inc'," .
"disallowed = '$url_not_inc', can_leave_domain=$can_leave_domain where site_id=$site_id");
echo mysql_error();
}
// A row in "pending" means an earlier crawl of this site did not finish.
$result = mysql_query("select site_id, temp_id, level, count, num from ".$mysql_table_prefix."pending where site_id='$site_id'");
echo mysql_error();
$row = mysql_fetch_row($result);
$pending = $row[0];
$level = 0;
$domain_arr = get_domains();
if ($pending == '') {
mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$url', 0, '$sessid')");
echo mysql_error();
} else if ($pending != '') {
printStandardReport('continueSuspended',$command_line);
// NOTE(review): the result of this query is discarded; the values used below
// come from $row fetched by the previous query.
mysql_query("select temp_id, level, count from ".$mysql_table_prefix."pending where site_id='$site_id'");
echo mysql_error();
// Resume with the suspended session's id, level and position.
$sessid = $row[1];
$level = $row[2];
$pend_count = $row[3] + 1;
$num = $row[4];
$pending = 1;
$tmp_urls = get_temp_urls($sessid);
}
if ($reindex != 1) {
mysql_query ("insert into ".$mysql_table_prefix."pending (site_id, temp_id, level, count) values ('$site_id', '$sessid', '0', '0')");
echo mysql_error();
}
$time = time();
$omit = check_robot_txt($url);
printHeader ($omit, $url, $command_line);
$mainurl = $url;
// NOTE(review): this resets the $num restored from the pending row above.
$num = 0;
// One iteration per link level; 'full' mode only stops when a level has no links.
while (($level <= $maxlevel && $soption == 'level') || ($soption == 'full')) {
// When resuming, start the first level at the saved position.
if ($pending == 1) {
$count = $pend_count;
$pending = 0;
} else
$count = 0;
$links = array();
$result = mysql_query("select distinct link from ".$mysql_table_prefix."temp where level=$level && id='$sessid' order by link");
echo mysql_error();
$rows = mysql_num_rows($result);
if ($rows == 0) {
break;
}
$i = 0;
while ($row = mysql_fetch_array($result)) {
$links[] = $row['link'];
}
reset ($links);
while ($count < count($links)) {
$num++;
$thislink = $links[$count];
$urlparts = parse_url($thislink);
reset ($omit);
$forbidden = 0;
// robots.txt exclusions: entries without a scheme are prefixed with the
// link's host before the substring test.
foreach ($omit as $omiturl) {
$omiturl = trim($omiturl);
$omiturl_parts = parse_url($omiturl);
if ($omiturl_parts['scheme'] == '') {
$check_omit = $urlparts['host'] . $omiturl;
} else {
$check_omit = $omiturl;
}
// NOTE(review): strpos() returns 0 for a match at the very start of the
// string, which this truthiness test treats as "not found".
if (strpos($thislink, $check_omit)) {
printRobotsReport($num, $thislink, $command_line);
check_for_removal($thislink);
$forbidden = 1;
break;
}
}
// Must-include / must-not-include strings configured for the site.
if (!check_include($thislink, $url_inc, $url_not_inc )) {
printUrlStringReport($num, $thislink, $command_line);
check_for_removal($thislink);
$forbidden = 1;
}
if ($forbidden == 0) {
printRetrieving($num, $thislink, $command_line);
$query = "select md5sum, indexdate from ".$mysql_table_prefix."links where url='$thislink'";
$result = mysql_query($query);
echo mysql_error();
$rows = mysql_num_rows($result);
if ($rows == 0) {
// New page: index it and record the progress in "pending".
index_url($thislink, $level+1, $site_id, '', $domain, '', $sessid, $can_leave_domain, $reindex);
mysql_query("update ".$mysql_table_prefix."pending set level = $level, count=$count, num=$num where site_id=$site_id");
echo mysql_error();
}else if ($rows <> 0 && $reindex == 1) {
// Known page during a re-index: pass the stored md5 so changes can be detected.
$row = mysql_fetch_array($result);
$md5sum = $row['md5sum'];
$indexdate = $row['indexdate'];
index_url($thislink, $level+1, $site_id, $md5sum, $domain, $indexdate, $sessid, $can_leave_domain, $reindex);
mysql_query("update ".$mysql_table_prefix."pending set level = $level, count=$count, num=$num where site_id=$site_id");
echo mysql_error();
}else {
printStandardReport('inDatabase',$command_line);
}
}
$count++;
}
$level++;
}
// Crawl finished: drop the session's queue and its suspension marker.
mysql_query ("delete from ".$mysql_table_prefix."temp where id = '$sessid'");
echo mysql_error();
mysql_query ("delete from ".$mysql_table_prefix."pending where site_id = '$site_id'");
echo mysql_error();
printStandardReport('completed',$command_line);
}
/**
 * Re-index every site stored in the sites table.
 *
 * Each row supplies the crawl settings for one index_site() call with
 * $reindex = 1; a stored depth of -1 selects 'full' mode, anything else
 * 'level' mode.
 *
 * @return void
 */
function index_all() {
    global $mysql_table_prefix;
    $sites = mysql_query("select url, spider_depth, required, disallowed, can_leave_domain from ".$mysql_table_prefix."sites");
    echo mysql_error();
    while ($site = mysql_fetch_row($sites)) {
        // An empty can_leave_domain column counts as 0.
        $leave_domain = ($site[4] == '') ? 0 : $site[4];
        $mode = ($site[1] == -1) ? 'full' : 'level';
        index_site($site[0], 1, $site[1], $mode, $site[2], $site[3], $leave_domain);
    }
}
/**
 * Load the links already queued for a crawl session.
 *
 * @param string $sessid Session id stored in the "id" column of the temp table.
 * @return array Map of link => 1 for every queued link of that session.
 */
function get_temp_urls ($sessid) {
    global $mysql_table_prefix;
    $queued = mysql_query("select link from ".$mysql_table_prefix."temp where id='$sessid'");
    echo mysql_error();
    $seen = Array();
    for ($entry = mysql_fetch_row($queued); $entry; $entry = mysql_fetch_row($queued)) {
        $seen[$entry[0]] = 1;
    }
    return $seen;
}
/**
 * Load all known domains from the domains table.
 *
 * @return array Map of domain name => domain_id.
 */
function get_domains () {
    global $mysql_table_prefix;
    $known = mysql_query("select domain_id, domain from ".$mysql_table_prefix."domains");
    echo mysql_error();
    $by_name = Array();
    for ($entry = mysql_fetch_row($known); $entry; $entry = mysql_fetch_row($known)) {
        // Column 0 is the id, column 1 the domain name.
        $by_name[$entry[1]] = $entry[0];
    }
    return $by_name;
}
/**
 * Print the command-line usage text to standard output.
 *
 * @return void
 */
function commandline_help() {
    $usage = array(
        "Usage: php spider.php <options>\n\n",
        "Options:\n",
        " -all\t\t Reindex everything in the database\n",
        " -u <url>\t Set url to index\n",
        " -f\t\t Set indexing depth to full (unlimited depth)\n",
        " -d <num>\t Set indexing depth to <num>\n",
        " -l\t\t Allow spider to leave the initial domain\n",
        " -r\t\t Set spider to reindex a site\n",
        " -m <string>\t Set the string(s) that an url must include (use \\n as a delimiter between multiple strings)\n",
        " -n <string>\t Set the string(s) that an url must not include (use \\n as a delimiter between multiple strings)\n",
    );
    foreach ($usage as $line) {
        print $line;
    }
}
// End of run: print the closing report, optionally mail a summary to the
// administrator, and close the log file if one was opened.
printStandardReport('quit', $command_line);
if ($email_log) {
    if ($all == 1) {
        $indexed = 'ALL';
    } else {
        $indexed = $url;
    }
    // Mention the log file only when logging was active.
    $log_report = $log_handle ? "Log saved into $log_file" : "";
    mail(
        $admin_email,
        "Sphider indexing report",
        "Sphider has finished indexing $indexed at ".date("y-m-d H:i:s").". ".$log_report
    );
}
if ($log_handle) {
    fclose($log_handle);
}
?>
rastrearfuncs.php
<?php
/**
 * Download a URL over a raw socket with an HTTP/1.0 GET request.
 *
 * Used as a fallback when the stream wrappers cannot read the URL.
 *
 * @param string $url Absolute http:// or https:// URL.
 * @return array 'state' is "ok", "timeout", "NOHOST" or "Cannot send request";
 *               'file' holds the response body and is only set once a
 *               response was read (states "ok" and "timeout").
 */
function getFileContents($url) {
    global $user_agent;
    $contents = array();
    $urlparts = parse_url($url);
    // An empty request target is not valid HTTP; ask for the root instead.
    $path = (isset($urlparts['path']) && $urlparts['path'] != "") ? $urlparts['path'] : "/";
    $host = isset($urlparts['host']) ? $urlparts['host'] : "";
    if (isset($urlparts['query']) && $urlparts['query'] != "") {
        $path .= "?".$urlparts['query'];
    }
    // Explicit port wins, then the scheme default; anything else falls back to 80.
    if (isset($urlparts['port'])) {
        $port = (int) $urlparts['port'];
    } else if (isset($urlparts['scheme']) && $urlparts['scheme'] == "https") {
        $port = 443;
    } else {
        $port = 80;
    }
    if ($port == 80) {
        $portq = "";
    } else {
        $portq = ":$port";
    }
    $all = "*/*";
    $request = "GET $path HTTP/1.0\r\nHost: $host$portq\r\nAccept: $all\r\nUser-Agent: $user_agent\r\n\r\n";
    $fsocket_timeout = 30;
    if (substr($url, 0, 5) == "https") {
        $target = "ssl://".$host;
    } else {
        $target = $host;
    }
    $errno = 0;
    $errstr = "";
    $fp = @ fsockopen($target, $port, $errno, $errstr, $fsocket_timeout);
    print $errstr;
    if (!$fp) {
        $contents['state'] = "NOHOST";
        printConnectErrorReport($errstr);
        return $contents;
    }
    if (!fputs($fp, $request)) {
        // Do not leak the socket when the request cannot be sent.
        fclose($fp);
        $contents['state'] = "Cannot send request";
        return $contents;
    }
    $data = '';
    stream_set_timeout($fp, $fsocket_timeout);
    do {
        $data .= fgets($fp, 8192);
        // Sample the status after the read so a timeout of this read is seen.
        $status = stream_get_meta_data($fp);
    } while (!feof($fp) && !$status['timed_out']);
    fclose($fp);
    if ($status['timed_out']) {
        $contents['state'] = "timeout";
    } else {
        $contents['state'] = "ok";
    }
    // The body starts after the blank line that ends the headers. Without that
    // separator there is no body to return.
    $separator = strpos($data, "\r\n\r\n");
    if ($separator === false) {
        $contents['file'] = '';
    } else {
        $contents['file'] = substr($data, $separator + 4);
    }
    return $contents;
}
/**
 * Check with a HEAD request whether a URL is reachable and indexable.
 *
 * @param string $url Absolute http:// or https:// URL.
 * @return array Always contains 'state':
 *               - "ok" plus 'content' ('text', 'pdf', 'doc', 'xls' or 'ppt')
 *                 when the content type is indexable,
 *               - "Relocation: http <code>" plus 'path' (the Location target)
 *                 for 3xx answers other than 302,
 *               - "Unreachable: http <code>" for answers outside 2xx/3xx,
 *               - "NOHOST", "Timed out (no reply from server)" or
 *                 "Not text or html" otherwise.
 *               'date' holds the Last-Modified header when one was seen
 *               before the Content-Type header.
 */
function url_status($url) {
    global $user_agent, $index_pdf, $index_doc, $index_xls, $index_ppt;
    $status = array();
    $urlparts = parse_url($url);
    // An empty request target is not valid HTTP; ask for the root instead.
    $path = (isset($urlparts['path']) && $urlparts['path'] != "") ? $urlparts['path'] : "/";
    $host = isset($urlparts['host']) ? $urlparts['host'] : "";
    if (isset($urlparts['query'])) {
        $path .= "?".$urlparts['query'];
    }
    // Explicit port wins, then the scheme default; anything else falls back to 80.
    if (isset($urlparts['port'])) {
        $port = (int) $urlparts['port'];
    } else if (isset($urlparts['scheme']) && $urlparts['scheme'] == "https") {
        $port = 443;
    } else {
        $port = 80;
    }
    if ($port == 80) {
        $portq = "";
    } else {
        $portq = ":$port";
    }
    $all = "*/*"; //just to prevent "comment effect" in get accept
    $request = "HEAD $path HTTP/1.1\r\nHost: $host$portq\r\nAccept: $all\r\nUser-Agent: $user_agent\r\n\r\n";
    if (substr($url, 0, 5) == "https") {
        $target = "ssl://".$host;
    } else {
        $target = $host;
    }
    $fsocket_timeout = 30;
    $errno = 0;
    $errstr = "";
    $fp = fsockopen($target, $port, $errno, $errstr, $fsocket_timeout);
    print $errstr;
    if (!$fp) {
        // No handle was opened, so there is nothing to close.
        $status['state'] = "NOHOST";
        return $status;
    }
    stream_set_timeout($fp, $fsocket_timeout);
    fputs($fp, $request);
    $answer = fgets($fp, 4096);

    // Status line. The slash must be escaped because "/" is the delimiter;
    // unescaped, the pattern is invalid and the status code is never read.
    $httpcode = 0;
    $full_httpcode = 0;
    $regs = Array ();
    if (preg_match("/HTTP\/[0-9.]+ (([0-9])[0-9]{2})/", $answer, $regs)) {
        $httpcode = $regs[2];
        $full_httpcode = $regs[1];
        if ($httpcode <> 2 && $httpcode <> 3) {
            $status['state'] = "Unreachable: http $full_httpcode";
            fclose($fp);
            return $status;
        }
    }

    // Headers: stop at Content-Type, at the blank line ending the headers, or
    // at end of stream. Header names are case-insensitive.
    $content = '';
    while ($answer) {
        $answer = fgets($fp, 4096);
        if ($answer === false || trim($answer) == '') {
            break;
        }
        if (preg_match("/Location: *([^\n\r ]+)/i", $answer, $regs) && $httpcode == 3 && $full_httpcode != 302) {
            $status['path'] = $regs[1];
            $status['state'] = "Relocation: http $full_httpcode";
            fclose($fp);
            return $status;
        }
        if (preg_match("/Last-Modified: *([a-z0-9,: ]+)/i", $answer, $regs)) {
            $status['date'] = $regs[1];
        }
        if (preg_match("/Content-Type:/i", $answer)) {
            $content = $answer;
            break;
        }
    }
    $socket_status = stream_get_meta_data($fp);
    fclose($fp);

    if (preg_match("/Content-Type: *([a-z\/.-]*)/i", $content, $regs)) {
        $mime = $regs[1];
        if ($mime == 'text/html' || $mime == 'text/' || $mime == 'text/plain') {
            $status['content'] = 'text';
            $status['state'] = 'ok';
        } else if ($mime == 'application/pdf' && $index_pdf == 1) {
            $status['content'] = 'pdf';
            $status['state'] = 'ok';
        } else if (($mime == 'application/msword' || $mime == 'application/vnd.ms-word') && $index_doc == 1) {
            $status['content'] = 'doc';
            $status['state'] = 'ok';
        } else if (($mime == 'application/excel' || $mime == 'application/vnd.ms-excel') && $index_xls == 1) {
            $status['content'] = 'xls';
            $status['state'] = 'ok';
        } else if (($mime == 'application/mspowerpoint' || $mime == 'application/vnd.ms-powerpoint') && $index_ppt == 1) {
            $status['content'] = 'ppt';
            $status['state'] = 'ok';
        } else {
            $status['state'] = "Not text or html";
        }
    } else if ($socket_status['timed_out']) {
        $status['state'] = "Timed out (no reply from server)";
    } else {
        $status['state'] = "Not text or html";
    }
    return $status;
}
/**
 * Read robots.txt on the host of $url and collect the disallowed paths that
 * apply to this crawler.
 *
 * Only records whose User-agent is "*" or equals $user_agent are honoured.
 * Field names are matched case-insensitively. An empty "Disallow:" in an
 * applicable record means everything is allowed, so an empty list is returned
 * at that point.
 *
 * @param string $url Any URL on the host to check.
 * @return array List of disallowed path strings (possibly empty).
 */
function check_robot_txt($url) {
    global $user_agent;
    $urlparts = parse_url($url);
    $url = 'http://'.$urlparts['host']."/robots.txt";
    $url_status = url_status($url);
    $omit = array ();
    if ($url_status['state'] != "ok") {
        return $omit;
    }
    $robot = file($url);
    if (!$robot) {
        // Stream wrapper failed; fall back to the raw-socket downloader.
        $contents = getFileContents($url);
        $file = isset($contents['file']) ? $contents['file'] : '';
        $robot = explode("\n", $file);
    }
    $this_agent = "";
    // Whether the record currently being read applies to this crawler. Rules
    // that appear before any User-agent line are ignored.
    $check = 0;
    foreach ($robot as $line) {
        $regs = Array ();
        if (preg_match("/^user-agent: *([^#]+) */i", $line, $regs)) {
            $this_agent = trim($regs[1]);
            $check = ($this_agent == '*' || $this_agent == $user_agent) ? 1 : 0;
        }
        if ($check == 1 && preg_match("/disallow: *([^#]+)/i", $line, $regs)) {
            $disallow_str = preg_replace("/[\r\n ]+/", "", $regs[1]);
            if (trim($disallow_str) != "") {
                $omit[] = $disallow_str;
            } else {
                // Empty Disallow: nothing is excluded. An empty array (rather
                // than null) keeps callers that iterate the result working.
                return array ();
            }
        }
    }
    return $omit;
}
/**
 * Strip the file part from a URL, leaving scheme, host, optional port and the
 * directory path (used as the base for resolving relative links).
 *
 * Query string and fragment are dropped. Port 80 is omitted from the result.
 *
 * @param string $url Absolute URL.
 * @return string URL ending at the last "/" of the path, or at the host when
 *                the URL has no path.
 */
function remove_file_from_url($url) {
    $url_parts = parse_url($url);
    $scheme = isset($url_parts['scheme']) ? $url_parts['scheme'] : '';
    $host = isset($url_parts['host']) ? $url_parts['host'] : '';
    $path = isset($url_parts['path']) ? $url_parts['path'] : '';
    // Drop the last path segment directly. Building a pattern from the file
    // name would break on names containing regex metacharacters such as
    // "(", ")" or "+".
    $path = preg_replace('/[^\/]+$/', '', $path);
    $port = isset($url_parts['port']) ? $url_parts['port'] : '';
    if ($port == 80 || $port == "") {
        $portq = "";
    } else {
        $portq = ":".$port;
    }
    return $scheme."://".$host.$portq.$path;
}
/**
 * Extract the followable links from an HTML page.
 *
 * Sources, in order: href attributes (skipping those marked rel=nofollow),
 * frame/iframe src attributes, window.location assignments, meta-refresh
 * targets and window.open() calls. Every candidate is normalised and filtered
 * by url_purify(); candidates it rejects (empty result) are dropped. Each raw
 * candidate string is processed at most once.
 *
 * @param string $file             HTML source of the page.
 * @param string $url              URL of the page, used to resolve relative links.
 * @param mixed  $can_leave_domain Passed through to url_purify().
 * @param string $base             Value of <base href>, which replaces $url when not empty.
 * @return array List of absolute URLs (may contain duplicates after normalisation).
 */
function get_links($file, $url, $can_leave_domain, $base) {
    // The base URL comes from either the meta tag or the current URL.
    if (!empty($base)) {
        $url = $base;
    }
    $links = array ();
    $checked_urls = Array();

    // href attributes: group 1 is the URL, group 4 is only present when the
    // attribute is followed by rel=nofollow.
    $regs = Array ();
    preg_match_all("/href\s*=\s*[\'\"]?([+:%\/\?~=&;\\\(\),._a-zA-Z0-9-]*)(#[.a-zA-Z0-9-]*)?[\'\" ]?(\s*rel\s*=\s*[\'\"]?(nofollow)[\'\"]?)?/i", $file, $regs, PREG_SET_ORDER);
    foreach ($regs as $val) {
        if (isset($val[4])) {
            // nofollow: skip without marking, so a later followable occurrence
            // of the same URL is still collected.
            continue;
        }
        if (isset($checked_urls[$val[1]])) {
            continue;
        }
        $checked_urls[$val[1]] = 1;
        if (($a = url_purify($val[1], $url, $can_leave_domain)) != '') {
            $links[] = $a;
        }
    }

    // In the remaining patterns group 1 is the tag/statement prefix and
    // group 2 is the URL. Reading group 1 here, or testing isset($val[4])
    // (always set for these patterns), would discard every match.
    $other_patterns = array(
        "/(frame[^>]*src[[:blank:]]*)=[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i",
        "/(window[.]location)[[:blank:]]*=[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i",
        "/(http-equiv=['\"]refresh['\"] *content=['\"][0-9]+;url)[[:blank:]]*=[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i",
        "/(window[.]open[[:blank:]]*[(])[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i",
    );
    foreach ($other_patterns as $pattern) {
        $regs = Array ();
        preg_match_all($pattern, $file, $regs, PREG_SET_ORDER);
        foreach ($regs as $val) {
            $candidate = isset($val[2]) ? $val[2] : '';
            if ($candidate == '' || isset($checked_urls[$candidate])) {
                continue;
            }
            $checked_urls[$candidate] = 1;
            if (($a = url_purify($candidate, $url, $can_leave_domain)) != '') {
                $links[] = $a;
            }
        }
    }
    return $links;
}
/**
 * Build a list of unique words with their occurrence counts.
 *
 * The words are optionally stemmed (when $stem_words == 1), sorted, and then
 * scanned as runs of equal neighbours. A word is kept when it is at least
 * $min_word_length bytes long, matches the letter (or letter/digit) pattern
 * after remove_accents(), and is not flagged in $common. A leading or trailing
 * "-" or "'" is stripped from kept words. Counts are capped at
 * $word_upper_bound.
 *
 * @param array $arr List of words.
 * @return array List of entries where index 1 is the word and index 2 its count.
 */
function unique_array($arr) {
global $min_word_length;
global $common;
global $word_upper_bound;
global $index_numbers, $stem_words;
if ($stem_words == 1) {
$newarr = Array();
foreach ($arr as $val) {
$newarr[] = stem($val);
}
$arr = $newarr;
}
// Sorting puts equal words next to each other so they can be counted as runs.
sort($arr);
reset($arr);
$newarr = array ();
$i = 0;
$counter = 1;
$element = current($arr);
// Digits only count as word characters when number indexing is enabled.
if ($index_numbers == 1) {
$pattern = "/[a-z0-9]+/";
} else {
$pattern = "/[a-z]+/";
}
$regs = Array ();
for ($n = 0; $n < sizeof($arr); $n ++) {
//check if word is long enough, contains alphabetic characters and is not a common word
//to eliminate/count multiple instance of words
// next() advances the internal array pointer; it returns false past the end,
// which closes the last run.
$next_in_arr = next($arr);
if ($next_in_arr != $element) {
if (strlen($element) >= $min_word_length && preg_match($pattern, remove_accents($element)) && (@ $common[$element] <> 1)) {
// Strip one leading and one trailing hyphen or apostrophe.
if (preg_match("/^(-|\\\')(.*)/", $element, $regs))
$element = $regs[2];
if (preg_match("/(.*)(\\\'|-)$/", $element, $regs))
$element = $regs[1];
$newarr[$i][1] = $element;
$newarr[$i][2] = $counter;
// The pointer was already advanced by next(), so current() is the next word.
$element = current($arr);
$i ++;
$counter = 1;
} else {
// NOTE(review): $counter is not reset here, so the count of a rejected word
// carries over into the next word's count - confirm whether that is intended.
$element = $next_in_arr;
}
} else {
if ($counter < $word_upper_bound)
$counter ++;
}
}
return $newarr;
}
/**
 * Check whether a link may be followed and return it as a normalised
 * absolute URL.
 *
 * A link is rejected (empty string returned) when it points to another host
 * while leaving the domain is not allowed, ends with an excluded extension
 * from $ext, ends with a backslash, has a query string flagged in
 * $apache_indexes, is a mailto:/javascript:/news: link, uses a scheme other
 * than http/https, or - when leaving the domain is not allowed - does not
 * contain the main URL's directory.
 *
 * @param string $url              Link as found in the page (absolute or relative).
 * @param string $parent_url       URL of the page containing the link.
 * @param mixed  $can_leave_domain 1 when links to other hosts may be followed.
 * @return string Normalised absolute URL, or '' when the link is rejected.
 */
function url_purify($url, $parent_url, $can_leave_domain) {
global $ext, $mainurl, $apache_indexes, $strip_sessids;
$urlparts = parse_url($url);
$main_url_parts = parse_url($mainurl);
// Foreign host while the crawl must stay on the main host.
if ($urlparts['host'] != "" && $urlparts['host'] != $main_url_parts['host'] && $can_leave_domain != 1) {
return '';
}
reset($ext);
// NOTE(review): each() was removed in PHP 8. $excl is also placed in the
// pattern without preg_quote().
while (list ($id, $excl) = each($ext))
if (preg_match("/\.$excl$/i", $url))
return '';
if (substr($url, -1) == '\\') {
return '';
}
if (isset($urlparts['query'])) {
// $apache_indexes is keyed by query string; presumably the sort links of
// Apache directory listings - confirm against the configuration.
if ($apache_indexes[$urlparts['query']]) {
return '';
}
}
if (preg_match("/[\/]?mailto:|[\/]?javascript:|[\/]?news:/i", $url)) {
return '';
}
if (isset($urlparts['scheme'])) {
$scheme = $urlparts['scheme'];
} else {
$scheme ="";
}
//only http and https links are followed
if (!($scheme == 'http' || $scheme == '' || $scheme == 'https')) {
return '';
}
//parent url might be used to build an url from relative path
$parent_url = remove_file_from_url($parent_url);
$parent_url_parts = parse_url($parent_url);
// Host-relative links get the parent's scheme and host; other scheme-less
// links are appended to the parent's directory.
if (substr($url, 0, 1) == '/') {
$url = $parent_url_parts['scheme']."://".$parent_url_parts['host'].$url;
} else
if (!isset($urlparts['scheme'])) {
$url = $parent_url.$url;
}
$url_parts = parse_url($url);
$urlpath = $url_parts['path'];
$regs = Array ();
// Collapse "dir/../" pairs until none are left.
while (preg_match("/[^\/]*\/[.]{2}\//", $urlpath, $regs)) {
$urlpath = str_replace($regs[0], "", $urlpath);
}
//remove relative path instructions like ../ etc
$urlpath = preg_replace("/\/+/", "/", $urlpath);
$urlpath = preg_replace("/[^\/]*\/[.]{2}/", "", $urlpath);
$urlpath = str_replace("./", "", $urlpath);
$query = "";
if (isset($url_parts['query'])) {
$query = "?".$url_parts['query'];
}
// NOTE(review): the port written into the result is taken from the main URL,
// not from the link itself, and the two halves of this test read different
// arrays - confirm this is intended.
if ($main_url_parts['port'] == 80 || $url_parts['port'] == "") {
$portq = "";
} else {
$portq = ":".$main_url_parts['port'];
}
// The fragment is not carried over into the rebuilt URL.
$url = $url_parts['scheme']."://".$url_parts['host'].$portq.$urlpath.$query;
//if we index sub-domains
if ($can_leave_domain == 1) {
return $url;
}
// NOTE(review): this overwrites the global $mainurl with its directory form.
$mainurl = remove_file_from_url($mainurl);
if ($strip_sessids == 1) {
$url = remove_sessid($url);
}
//only urls in staying in the starting domain/directory are followed
$url = convert_url($url);
if (strstr($url, $mainurl) == false) {
return '';
} else
return $url;
}
/**
 * Store the keywords of a page and their weights.
 *
 * Each word is looked up in the $all_keywords cache (and inserted into the
 * keywords table when unknown); the (link, keyword, weight, domain) relations
 * are then written in one multi-row insert per link_keyword table. The table
 * suffix 0..f is the first hex digit of md5(word). Words longer than 30 bytes
 * are skipped.
 *
 * NOTE(review): the words are interpolated into SQL as-is; in the visible
 * caller they come from text that clean_file() already passed through
 * addslashes() - confirm for other callers before adding escaping here, which
 * would otherwise double-escape.
 *
 * @param array $wordarray List of entries with index 1 = word and index 2 = weight.
 * @param mixed $link_id   Id of the page in the links table.
 * @param mixed $domain    Id of the page's domain.
 * @return void
 */
function save_keywords($wordarray, $link_id, $domain) {
    global $mysql_table_prefix, $all_keywords;
    if (!is_array($wordarray)) {
        return;
    }
    // Pending value lists, keyed by table suffix.
    $inserts = array();
    // foreach instead of each(), which was removed in PHP 8.
    foreach ($wordarray as $entry) {
        $word = $entry[1];
        $weight = $entry[2];
        if (strlen($word) > 30) {
            continue;
        }
        $keyword_id = isset($all_keywords[$word]) ? $all_keywords[$word] : "";
        if ($keyword_id == "") {
            mysql_query("insert into ".$mysql_table_prefix."keywords (keyword) values ('$word')");
            if (mysql_errno() == 1062) {
                // Duplicate key: the keyword already exists, so fetch its id.
                $result = mysql_query("select keyword_ID from ".$mysql_table_prefix."keywords where keyword='$word'");
                echo mysql_error();
                $row = mysql_fetch_row($result);
                $keyword_id = $row[0];
            } else {
                $keyword_id = mysql_insert_id();
                echo mysql_error();
            }
            // Cache the id in both cases so the word is not looked up again.
            $all_keywords[$word] = $keyword_id;
        }
        $wordmd5 = substr(md5($word), 0, 1);
        if (!isset($inserts[$wordmd5])) {
            $inserts[$wordmd5] = "";
        }
        $inserts[$wordmd5] .= ",($link_id, $keyword_id, $weight, $domain)";
    }
    for ($i = 0; $i <= 15; $i++) {
        $char = dechex($i);
        if (!isset($inserts[$char])) {
            continue;
        }
        // Drop the leading comma of the accumulated list.
        $values = substr($inserts[$char], 1);
        if ($values != "") {
            $query = "insert into ".$mysql_table_prefix."link_keyword$char (link_id, keyword_id, weight, domain) values $values";
            mysql_query($query);
            echo mysql_error();
        }
    }
}
/**
 * Extract indexing-related metadata from the <head> of an HTML document.
 *
 * Reads the robots, description and keywords meta tags (name attribute before
 * content attribute) and the <base href>. All keys are always present in the
 * result, also when the document has no <head> or a tag is missing.
 *
 * @param string $file HTML source.
 * @return array 'description' and 'keywords' (passed through addslashes();
 *               keywords have commas/spaces collapsed to single spaces),
 *               'nofollow' and 'noindex' (0 or 1, from the robots meta tag),
 *               'base' (raw href of the base tag, '' when absent).
 */
function get_head_data($file) {
    $data = array(
        'description' => '',
        'keywords' => '',
        'nofollow' => 0,
        'noindex' => 0,
        'base' => '',
    );
    $regs = Array ();
    if (!preg_match("@<head[^>]*>(.*?)<\/head>@si", $file, $regs) || $regs[1] == "") {
        return $data;
    }
    $headdata = $regs[1];

    // Read a capture only when its pattern matched; $res is always "set", so
    // isset($res) cannot be used to detect a match.
    $res = Array ();
    $robots = preg_match("/<meta +name *=[\"']?robots[\"']? *content=[\"']?([^<>'\"]+)[\"']?/i", $headdata, $res) ? $res[1] : "";
    $description = preg_match("/<meta +name *=[\"']?description[\"']? *content=[\"']?([^<>'\"]+)[\"']?/i", $headdata, $res) ? $res[1] : "";
    $keywords = preg_match("/<meta +name *=[\"']?keywords[\"']? *content=[\"']?([^<>'\"]+)[\"']?/i", $headdata, $res) ? $res[1] : "";
    // e.g. <base href="http://www.consil.co.uk/index.php" />
    $base = preg_match("/<base +href *= *[\"']?([^<>'\"]+)[\"']?/i", $headdata, $res) ? $res[1] : "";

    $keywords = preg_replace("/[, ]+/", " ", $keywords);
    foreach (explode(",", strtolower($robots)) as $directive) {
        $directive = trim($directive);
        if ($directive == "noindex") {
            $data['noindex'] = 1;
        }
        if ($directive == "nofollow") {
            $data['nofollow'] = 1;
        }
    }
    $data['description'] = addslashes($description);
    $data['keywords'] = addslashes($keywords);
    $data['base'] = $base;
    return $data;
}
// Reduces a fetched document to the pieces that get stored and indexed.
//
// $file - document contents (HTML, or text already extracted from pdf/doc).
// $url  - URL the document was fetched from; supplies host and directory path.
// $type - document type; for 'pdf' and 'doc' the title is taken from the first
//         words of the text when no <title> tag exists.
//
// Reads the globals $entities (pattern => replacement map applied to the text),
// $index_host and $index_meta_keywords (1 = append host/path resp. meta keywords
// to the indexed content).
//
// returns - array with 'fulltext', 'content', 'title' (addslashes()-escaped),
//           'description', 'keywords', 'nofollow', 'noindex', 'base' (copied
//           from get_head_data()), 'host' and 'path'.
function clean_file($file, $url, $type) {
    global $entities, $index_host, $index_meta_keywords;

    $urlparts = parse_url($url);
    // parse_url() omits components that are absent (and returns false for a
    // badly malformed URL), so fall back to empty strings.
    $host = isset($urlparts['host']) ? $urlparts['host'] : "";
    $rawpath = isset($urlparts['path']) ? $urlparts['path'] : "";
    //remove filename from path
    $path = preg_replace('/([^\/]+)$/i', "", $rawpath);

    $file = preg_replace("/<link rel[^<>]*>/i", " ", $file);
    $file = preg_replace("@<!--sphider_noindex-->.*?<!--\/sphider_noindex-->@si", " ",$file);
    $file = preg_replace("@<!--.*?-->@si", " ",$file);
    $file = preg_replace("@<script[^>]*?>.*?</script>@si", " ",$file);

    $headdata = get_head_data($file);
    if (!is_array($headdata)) {
        $headdata = array();
    }
    // Guarantee every key read below exists, whatever get_head_data() returned.
    $headdata += array('description' => '', 'keywords' => '', 'nofollow' => 0, 'noindex' => 0, 'base' => '');

    $title = "";
    $regs = array();
    if (preg_match("@<title *>(.*?)<\/title*>@si", $file, $regs)) {
        $title = trim($regs[1]);
        $file = str_replace($regs[0], "", $file);
    } else if ($type == 'pdf' || $type == 'doc') { //the title of a non-html file is its first few words
        $title = substr($file, 0, strrpos(substr($file, 0, 40), " "));
    }

    $file = preg_replace("@<style[^>]*>.*?<\/style>@si", " ", $file);
    //create spaces between tags, so that removing tags doesnt concatenate strings
    $file = preg_replace("/<[\w ]+>/", "\\0 ", $file);
    $file = preg_replace("/<\/[\w ]+>/", "\\0 ", $file);
    $file = strip_tags($file);
    // NOTE(review): this statement used to replace a plain space with a plain
    // space (a no-op), which looks like an HTML-decoded "&nbsp;" - restored so
    // the stored full text no longer contains the raw entity.
    $file = str_replace("&nbsp;", " ", $file);

    $fulltext = $file;
    $file .= " ".$title;
    if ($index_host == 1) {
        $file = $file." ".$host." ".$path;
    }
    if ($index_meta_keywords == 1) {
        $file = $file." ".$headdata['keywords'];
    }

    //replace codes with ascii chars
    // The /e modifier no longer exists in PHP 7+ (preg_replace() would return
    // NULL and wipe the text), so the numeric references are decoded through
    // callbacks. As before, the result is a single byte (value modulo 256).
    $file = preg_replace_callback('~&#x([0-9a-f]+);~i', function ($m) {
        return chr(((int) hexdec($m[1])) & 0xFF);
    }, $file);
    $file = preg_replace_callback('~&#([0-9]+);~', function ($m) {
        return chr(((int) $m[1]) & 0xFF);
    }, $file);

    $file = strtolower($file);

    // each() was removed in PHP 8; foreach walks the same key/value pairs.
    if (is_array($entities)) {
        foreach ($entities as $entity => $replacement) {
            $file = preg_replace("/".$entity."/i", $replacement, $file);
        }
    }
    // Any named entity still left is turned into a space, then punctuation
    // and runs of whitespace are collapsed.
    $file = preg_replace("/&[a-z]{1,6};/", " ", $file);
    $file = preg_replace("/[\*\^\+\?\\\.\[\]\^\$\|\{\)\(\}~!\"\/@#$%&=`;><:,]+/", " ", $file);
    $file = preg_replace("/\s+/", " ", $file);

    $data = array();
    $data['fulltext'] = addslashes($fulltext);
    $data['content'] = addslashes($file);
    $data['title'] = addslashes($title);
    $data['description'] = $headdata['description'];
    $data['keywords'] = $headdata['keywords'];
    $data['host'] = $host;
    $data['path'] = $path;
    $data['nofollow'] = $headdata['nofollow'];
    $data['noindex'] = $headdata['noindex'];
    $data['base'] = $headdata['base'];
    return $data;
}
// Recomputes the weight (index 2) of every entry of $wordarray, taking into
// account whether the word (index 1 of the entry) also occurs in the page
// title, the host name, the URL path or the meta keywords.
//
// $wordarray - list of word entries; index 1 is compared against the other
//              word lists and index 2 is replaced by the new integer weight.
// $title, $host, $path, $keywords - strings that are lower-cased, split on
//              non-alphanumeric characters and passed through unique_array().
//              NOTE(review): entries returned by unique_array() are assumed to
//              carry the word at index 1, as the comparisons imply - confirm.
//
// Host and path matches are only considered when the global $index_host is 1,
// meta keyword matches only when $index_meta_keywords is 1.
//
// returns - $wordarray with updated weights.
function calc_weights($wordarray, $title, $host, $path, $keywords) {
    global $index_host, $index_meta_keywords;

    $hostarray = unique_array(explode(" ", preg_replace("/[^[:alnum:]-]+/i", " ", strtolower($host))));
    $patharray = unique_array(explode(" ", preg_replace("/[^[:alnum:]-]+/i", " ", strtolower($path))));
    $titlearray = unique_array(explode(" ", preg_replace("/[^[:alnum:]-]+/i", " ", strtolower($title))));
    $keywordsarray = unique_array(explode(" ", preg_replace("/[^[:alnum:]-]+/i", " ", strtolower($keywords))));
    $path_depth = countSubstrs($path, "/");

    // each() was removed in PHP 8; foreach visits the same entries and, unlike
    // the old loops, does not overwrite the $path / $host parameters.
    foreach ($wordarray as $wid => $word) {
        $word_in_path = 0;
        $word_in_domain = 0;
        $word_in_title = 0;
        $meta_keyword = 0;

        if ($index_host == 1) {
            foreach ($patharray as $path_entry) {
                if ($path_entry[1] == $word[1]) {
                    $word_in_path = 1;
                    break;
                }
            }
            foreach ($hostarray as $host_entry) {
                if ($host_entry[1] == $word[1]) {
                    $word_in_domain = 1;
                    break;
                }
            }
        }

        if ($index_meta_keywords == 1) {
            foreach ($keywordsarray as $keyword_entry) {
                if ($keyword_entry[1] == $word[1]) {
                    $meta_keyword = 1;
                    break;
                }
            }
        }

        foreach ($titlearray as $title_entry) {
            if ($title_entry[1] == $word[1]) {
                $word_in_title = 1;
                break;
            }
        }

        $wordarray[$wid][2] = (int) (calc_weight($wordarray[$wid][2], $word_in_title, $word_in_domain, $word_in_path, $path_depth, $meta_keyword));
    }
    reset($wordarray);
    return $wordarray;
}
// Reports whether the links table already holds a row with this md5sum.
//
// $md5sum - checksum value, inserted into the query as given.
// returns - true when at least one matching row exists, false otherwise.
//           Any MySQL error text is echoed.
function isDuplicateMD5($md5sum) {
    global $mysql_table_prefix;

    $sql = "select link_id from ".$mysql_table_prefix."links where md5sum='$md5sum'";
    $matches = mysql_query($sql);
    echo mysql_error();

    return mysql_num_rows($matches) > 0;
}
// Decides whether a link passes the "must include" / "must not include" rules.
//
// $link    - URL being tested.
// $inc     - newline-separated rules; when non-empty the link must match at
//            least one of them.
// $not_inc - newline-separated rules; a link matching any of them is rejected.
//
// A rule starting with '*' is a complete regular expression (delimiters
// included) after the asterisk; any other rule is a plain substring. Blank
// rules are ignored. Exclusion rules are evaluated before inclusion rules.
//
// returns - true when the link may be followed/indexed, false otherwise.
function check_include($link, $inc, $not_inc) {
    $url_inc = ($inc != "") ? explode("\n", $inc) : array();
    $url_not_inc = ($not_inc != "") ? explode("\n", $not_inc) : array();

    foreach ($url_not_inc as $str) {
        $str = trim($str);
        if ($str == "") {
            continue;
        }
        if (substr($str, 0, 1) == '*') {
            if (preg_match(substr($str, 1), $link)) {
                return false;
            }
        } else if (strpos($link, $str) !== false) {
            return false;
        }
    }

    // No inclusion rules: everything that was not excluded is accepted.
    if ($inc == "") {
        return true;
    }

    // The regex branch used to leave the loop with "break 2" although only one
    // loop encloses it, which PHP rejects with "Cannot 'break' 2 levels";
    // returning directly avoids the problem.
    foreach ($url_inc as $str) {
        $str = trim($str);
        if ($str == "") {
            continue;
        }
        if (substr($str, 0, 1) == '*') {
            if (preg_match(substr($str, 1), $link)) {
                return true;
            }
        } else if (strpos($link, $str) !== false) {
            return true;
        }
    }
    return false;
}
// Handles a URL that should possibly be dropped from the index.
//
// Looks the URL up in the links table. If a row is found and its "visible"
// counter is above zero, the counter is decremented; otherwise the link row
// and its rows in the sixteen link_keyword0..link_keywordf tables are deleted
// and a 'pageRemoved' report is printed. Nothing happens when the URL is not
// in the table. MySQL error text is echoed after every query.
//
// $url - URL to look up; it is placed into the query as given.
function check_for_removal($url) {
    global $mysql_table_prefix;
    global $command_line;

    $lookup = mysql_query("select link_id, visible from ".$mysql_table_prefix."links"." where url='$url'");
    echo mysql_error();
    if (!(mysql_num_rows($lookup) > 0)) {
        return;
    }

    $row = mysql_fetch_row($lookup);
    $link_id = $row[0];
    $visible = $row[1];

    if ($visible > 0) {
        // Still has remaining "visible" credit: just count it down.
        $visible--;
        mysql_query("update ".$mysql_table_prefix."links set visible=$visible where link_id=$link_id");
        echo mysql_error();
        return;
    }

    mysql_query("delete from ".$mysql_table_prefix."links where link_id=$link_id");
    echo mysql_error();

    // One keyword table per hexadecimal digit 0..f.
    $n = 0;
    while ($n < 16) {
        $suffix = dechex($n);
        mysql_query("delete from ".$mysql_table_prefix."link_keyword$suffix where link_id=$link_id");
        echo mysql_error();
        $n++;
    }
    printStandardReport('pageRemoved',$command_line);
}
// Normalises a URL taken from HTML markup so it can be requested.
//
// - "&amp;" (the HTML-escaped ampersand) becomes a plain "&". The previous
//   code replaced "&" with "&", which did nothing - presumably the entity was
//   decoded when the source was copied.
// - spaces are percent-encoded as "%20".
//
// $url    - URL as found in the page.
// returns - the converted URL.
function convert_url($url) {
    $url = str_replace("&amp;", "&", $url);
    $url = str_replace(" ", "%20", $url);
    return $url;
}
// Converts a binary document to plain text with an external command-line tool.
//
// The contents are written to "<$tmp_dir>/tmp_file", the converter matching
// $source_type is executed on that file, and the file is deleted afterwards.
//
// $contents    - raw bytes of the document.
// $source_type - 'pdf', 'doc', 'xls' or 'ppt'; selects the global
//                $pdftotext_path, $catdoc_path, $xls2csv_path or $catppt_path.
// returns      - the converter's output lines joined with single spaces; an
//                empty string for an unknown $source_type (previously an
//                undefined variable was handed to implode()).
//
// Terminates the script with die() when the temporary file cannot be opened
// or written.
function extract_text($contents, $source_type) {
    global $tmp_dir, $pdftotext_path, $catdoc_path, $xls2csv_path, $catppt_path;

    $temp_file = "tmp_file";
    $filename = $tmp_dir."/".$temp_file ;
    if (!$handle = fopen($filename, 'w')) {
        die ("Cannot open file $filename");
    }
    if (fwrite($handle, $contents) === FALSE) {
        fclose($handle);
        die ("Cannot write to file $filename");
    }
    fclose($handle);

    // Quote the file name so a temp directory containing spaces or shell
    // metacharacters cannot break (or alter) the command line.
    $quoted = escapeshellarg($filename);
    $command = "";
    if ($source_type == 'pdf') {
        $command = $pdftotext_path." ".$quoted." -";
    } else if ($source_type == 'doc') {
        $command = $catdoc_path." ".$quoted;
    } else if ($source_type == 'xls') {
        $command = $xls2csv_path." ".$quoted;
    } else if ($source_type == 'ppt') {
        $command = $catppt_path." ".$quoted;
    }

    $result = array();
    $retval = 0;
    if ($command != "") {
        exec($command, $result, $retval);
    }

    unlink ($filename);
    return implode(' ', $result);
}
// Computes the weight of one word on one page.
//
// $words_in_page - base value for the word on the page.
// $word_in_title, $word_in_domain, $word_in_path, $meta_keyword - multiplied by
//                  the globals $title_weight, $domain_weight, $path_weight and
//                  $meta_weight respectively and added to the base value.
// $path_depth    - enters the divisor (0.8 + 0.2 * depth).
// returns        - (sum of the terms) * 10 / (0.8 + 0.2 * $path_depth).
function calc_weight ($words_in_page, $word_in_title, $word_in_domain, $word_in_path, $path_depth, $meta_keyword) {
    global $title_weight, $domain_weight, $path_weight,$meta_weight;

    // Terms are accumulated left to right, in the order of the formula above.
    $score = $words_in_page + $word_in_title * $title_weight;
    $score = $score + $word_in_domain * $domain_weight;
    $score = $score + $word_in_path * $path_weight;
    $score = $score + $meta_keyword * $meta_weight;

    $depth_divisor = 0.8 + 0.2 * $path_depth;

    return $score * 10 / $depth_divisor;
}
// Strips a trailing session-id parameter from a URL.
//
// Only a PHPSESSID, JSESSIONID, ASPSESSIONID or sid parameter that is the last
// thing in the URL and whose value is purely alphanumeric is removed, together
// with the "?" or "&" in front of it.
//
// $url    - URL to clean.
// returns - the URL without the trailing session parameter.
function remove_sessid($url) {
    $trailing_session_param = "/(\?|&)(PHPSESSID|JSESSIONID|ASPSESSIONID|sid)=[0-9a-zA-Z]+$/";
    $cleaned = preg_replace($trailing_session_param, "", $url);
    return $cleaned;
}
// Returns $texto as well-formed UTF-8.
//
// The input is scanned byte by byte:
//  - ASCII bytes are kept, except control characters other than tab, line
//    feed and carriage return, which are dropped;
//  - well-formed UTF-8 sequences of 2, 3 or 4 bytes are copied unchanged;
//  - any other byte >= 0x80 is treated as an ISO-8859-1 character: 0xA0-0xFF
//    is re-encoded as the matching two-byte UTF-8 sequence, 0x80-0x9F (C1
//    control codes) is dropped.
//
// Compared with a check of the bit pattern alone, overlong encodings (lead
// bytes 0xC0/0xC1, 0xE0 0x80-0x9F, 0xF0 0x80-0x8F), UTF-16 surrogates
// (0xED 0xA0-0xBF) and code points above U+10FFFF (lead bytes 0xF5-0xF7,
// 0xF4 0x90-0xBF) are rejected as well, so the result is always valid UTF-8.
// The ISO-8859-1 conversion is done by hand because utf8_encode() is
// deprecated as of PHP 8.2.
//
// $texto  - byte string in UTF-8, ISO-8859-1 or a mixture of both.
// returns - sanitised UTF-8 string.
function sanitizar_utf8($texto) {
    $saida = '';
    $len = strlen($texto);
    $i = 0;

    while ($i < $len) {
        $char = $texto[$i++];
        $ord = ord($char);

        // First byte 0xxxxxxx: single-byte ASCII symbol
        if ($ord < 0x80) {
            $controle = ($ord < 0x20 || $ord == 0x7F);
            // Control characters are kept only if tab, line feed or carriage return
            if (!$controle || $ord == 9 || $ord == 10 || $ord == 13) {
                $saida .= $char;
            }
            continue;
        }

        // Expected sequence length according to the lead byte (0 = not a valid lead byte)
        if ($ord >= 0xC2 && $ord <= 0xDF) {
            $bytes = 2;
        } elseif ($ord >= 0xE0 && $ord <= 0xEF) {
            $bytes = 3;
        } elseif ($ord >= 0xF0 && $ord <= 0xF4) {
            $bytes = 4;
        } else {
            $bytes = 0;
        }

        // $i already points at the byte after the lead byte
        $valido = ($bytes > 0 && $i + $bytes - 1 <= $len);
        if ($valido) {
            // The first continuation byte has a narrower range for some lead bytes
            $min = 0x80;
            $max = 0xBF;
            if ($ord == 0xE0) {
                $min = 0xA0; // excludes overlong 3-byte forms
            } elseif ($ord == 0xED) {
                $max = 0x9F; // excludes surrogates U+D800..U+DFFF
            } elseif ($ord == 0xF0) {
                $min = 0x90; // excludes overlong 4-byte forms
            } elseif ($ord == 0xF4) {
                $max = 0x8F; // excludes code points above U+10FFFF
            }
            for ($b = 0; $b < $bytes - 1; $b++) {
                $ord_extra = ord($texto[$i + $b]);
                if ($ord_extra < $min || $ord_extra > $max) {
                    $valido = false;
                    break;
                }
                // Remaining continuation bytes: plain 10xxxxxx
                $min = 0x80;
                $max = 0xBF;
            }
        }

        if ($valido) {
            $saida .= substr($texto, $i - 1, $bytes);
            $i += $bytes - 1;
        } elseif ($ord > 0x9F) {
            // ISO-8859-1 byte 0xA0-0xFF -> U+00A0-U+00FF as two UTF-8 bytes.
            // $i is left right after this byte, so the following bytes are
            // examined again on their own.
            $saida .= chr(0xC0 | ($ord >> 6)) . chr(0x80 | ($ord & 0x3F));
        }
        // else: 0x80-0x9F outside a valid sequence is dropped
    }
    return $saida;
}
?>
Alguém pode me ajudar a implementá-lo ou me mostrar onde está o erro?



Postagens
Male
