PDA

Просмотр полной версии : Google_Grabber


KPOT_f!nd
26.02.2008, 02:42
Сабж. Нужен любой (php/perl/питон и т.д.) Google_Grabber... Самому нет возможности писать, работа не позволяет (времени хавает много)...

Isis
26.02.2008, 02:53
Google Parser by lamarez
IcQ:123424
Site:k0x.ru

Use: php gp.php -q "your query" [-title] [-page ...]

-help This help:).
-q Your query.
-page Page number.
-num count of results.
-title Show page title.

Search pages with word "lamarez"
Example: php gp.php -q "lamarez"
Show 100 links of sites with word "lamarez" and their titles.
Example: php gp.php -q "lamarez" -num 100 -title

<?
//error_reporting(0);
/**
 * Fetch a resource over plain HTTP (port 80) through a raw socket.
 *
 * The whole raw response (status line, headers and body) is returned
 * unparsed.
 *
 * @param string $domain Host name; resolved with gethostbyname() and sent as the Host header.
 * @param string $path   Request target placed on the GET line.
 * @return string|int    Raw response text, or 0 when the connection cannot be opened.
 */
function GetSome($domain, $path)
{
    $hostname = gethostbyname($domain);
    $responce = "";
    $errnum = 0;
    $errstr = "";

    $fsock = fsockopen($hostname, 80, $errnum, $errstr, 5);
    if (!$fsock) {
        return 0;
    }

    // HTTP header lines must be terminated by CRLF, not a bare LF.
    $headers  = "GET $path HTTP/1.1\r\n";
    $headers .= "Host: $domain\r\n";
    $headers .= "User-Agent: Mozilla/5.0 (Windows; U; Windows NT 5.1; ru; rv:1.8.0.2) Gecko/20060308 Firefox/1.5.0.2\r\n";
    $headers .= "Accept: text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5\r\n";
    $headers .= "Accept-Language: ru-ru,ru;q=0.8,en-us;q=0.5,en;q=0.3\r\n";
    $headers .= "Accept-Charset: windows-1251,utf-8;q=0.7,*;q=0.7\r\n";
    // Ask the server to close the connection after the response; with a
    // kept-alive HTTP/1.1 connection the feof() loop below never sees EOF.
    $headers .= "Connection: close\r\n\r\n";
    fwrite($fsock, $headers);

    // Bound every read so a stalled server cannot block the loop forever.
    stream_set_timeout($fsock, 5);
    while (!feof($fsock)) {
        $chunk = fread($fsock, 1024);
        if ($chunk === false) {
            break;
        }
        $responce .= $chunk;

        $meta = stream_get_meta_data($fsock);
        if ($meta['timed_out']) {
            break;
        }
    }
    fclose($fsock);

    return $responce;
}
/**
 * Print a prompt on the console and read one line of user input.
 *
 * The prompt is converted from WINDOWS-1251 to CP866 before being echoed.
 *
 * @param string $str Prompt text, expected in WINDOWS-1251.
 * @return string     The line read from STDIN with surrounding whitespace trimmed.
 */
function GetCLI($str)
{
    $prompt = iconv("WINDOWS-1251", "CP866", $str);
    echo $prompt;

    return trim(fgets(STDIN));
}
// The script relies on $argv, which is only populated when run from the console.
if (!isset($argv)) {
    die("Work only in console. Use: php gp.php");
}

// "-help" anywhere on the command line prints the usage text and exits.
if (in_array('-help', $argv)) {
    $helpLines = array(
        'Google Parser by lamarez',
        'IcQ:123424',
        'Site:k0x.ru',
        '',
        'Use: php gp.php -q "your query" [-title] [-page ...]',
        '',
        '-help This help:).',
        '-q Your query.',
        '-page Page number.',
        '-num count of results.',
        '-title Show page title.',
        '',
        'Search pages with word "lamarez"',
        "\tExample: php gp.php -q \"lamarez\"",
        'Show 100 links of sites with word "lamarez" and their titles.',
        "\tExample: php gp.php -q \"lamarez\" -num 100 -title",
    );
    $helptext = implode("\n", $helpLines) . "\n";
    die($helptext);
}
////////////////////////////////////////////////////////////
// Command-line options. A flag given as the very last argument has no value
// after it; in that case the default is used instead of reading a missing
// $argv element.

// -page N : zero-based result page (default 0, never negative).
$page = 0;
$flagPos = array_search('-page', $argv, true);
if ($flagPos !== false && isset($argv[$flagPos + 1])) {
    $page = max(0, (int)$argv[$flagPos + 1]);
}

// -num N : number of results per page (default 10, at least 1).
$num = 10;
$flagPos = array_search('-num', $argv, true);
if ($flagPos !== false && isset($argv[$flagPos + 1])) {
    $num = max(1, (int)$argv[$flagPos + 1]);
}

// -q "text" : search query; prompt on the console when it is not supplied.
$flagPos = array_search('-q', $argv, true);
if ($flagPos !== false && isset($argv[$flagPos + 1])) {
    $query = $argv[$flagPos + 1];
} else {
    $query = GetCLI('Введите строку для поиска:');
}

// -title : also print each result's title.
$title = in_array('-title', $argv, true);
// -beautiful : surround each result with separator lines.
$cool = in_array('-beautiful', $argv, true);

////////////////////////////////////////////////////////////
// Build the search URL. $num and $page are forced to integers so that
// nothing other than digits can reach the query string.
$query = urlencode($query);
$perPage = (int)$num;
$offset = (int)$page * $perPage;
$searchUrl = "http://www.google.com/search?q=$query&num=$perPage&hl=ru&client=opera&rls=ru&start=" . $offset . "&sa=N";

// GetSome('www.google.com', $searchUrl) is the raw-socket alternative to this call.
$googletext = file_get_contents($searchUrl);
if ($googletext === false) {
    // Report the failure instead of silently printing nothing.
    fwrite(STDERR, "Could not fetch search results from $searchUrl\n");
    exit(1);
}

// Drop the bold highlighting so titles match the pattern below as plain text.
$googletext = str_replace(array('<b>', '</b>'), '', $googletext);

// $zret[1] receives the link URLs, $zret[2] the link titles.
preg_match_all('#<a href="(\S+)" class=l>([^<]*)</a>#i', $googletext, $zret);

$separator = "--------------------------------------------------------------------------------\n";
$found = count($zret[1]);
for ($i = 0; $i < $found; $i++) {
    if ($cool) {
        echo $separator;
    }
    if ($title) {
        // Titles are re-encoded from WINDOWS-1251 to CP866 for the console.
        $titletext = htmlspecialchars_decode($zret[2][$i]);
        $titletext = iconv("WINDOWS-1251", "CP866", $titletext);
        echo "\t" . $titletext . "\n";
    }
    echo $zret[1][$i] . "\n";
    if ($cool) {
        echo $separator;
    }
}
?>

Можно просто запустить php script.php, а затем через stdin ввести, что нужно искать...
А можно php script.php -num 100 -title -page 2 и опять же ввести запрос через stdin...
© lamarez gay

Хозяин
26.02.2008, 03:23
KPOT_f!nd, выкладывал, Aura, лучшее в своем роде
https://forum.antichat.ru/thread61689-Aura.html

KPOT_f!nd
26.02.2008, 03:37
Огромное спасибо, тему можно локнуть...

Хозяин
26.02.2008, 03:41
А если что-то простое нужно, то вот
<?php
// Page flow: render the form; on POST, run every query through Google,
// collect the result links, fetch each linked page and store its cleaned text.
Header("Content-Type: text/html; charset=windows-1251");
echo '<style>input {width:220px;}</style>';
echo '<form action="" method="post">';
echo '<b>Запросы:</b><textarea rows=15 cols=72 name=str></textarea><br>';
echo '<b>кол-во URL на запрос:</b><input type=text name=count value="10"><br><br>';
echo '
<input type=submit value="Парсить"> <br>
</form>';
if (!isset($_POST['str']) || !is_string($_POST['str'])) die();
set_time_limit(0);
ob_implicit_flush();

// Number of URLs requested per query: must be a positive integer, since it
// is placed directly into the search URL. Falls back to the form default.
$result = (isset($_POST['count']) && is_string($_POST['count'])) ? (int)trim($_POST['count']) : 10;
if ($result < 1) {
    $result = 10;
}

// One query per textarea line.
$querys = explode("\n", trim($_POST['str']));

// Step 1: save the raw search result pages for all queries into result.txt.
$links = fopen('./result.txt', 'w');
if ($links === false) {
    die('Cannot open ./result.txt for writing');
}
foreach ($querys as $query) {
    // Browsers submit textarea lines with CRLF; trimming removes the stray
    // "\r" that would otherwise become part of the search query.
    $query = trim($query);
    if ($query === '') {
        continue;
    }
    $url = 'http://www.google.com/ie?q=' . urlencode($query) . '&num=' . $result . '&hl=en&lr=&c2coff=1&start=0&sa=N';
    $str = get_page($url);
    if (is_string($str)) {
        fwrite($links, $str);
    }
}
fclose($links);

// Step 2: extract the links from the saved pages.
$query = ParseUrls();
if (!is_array($query)) {
    $query = array();
}

// Links containing any of these substrings are replaced by the "EMPTY" marker.
$exclude = array("google", "doc", "pdf", "rtf", "xls", "jsp", "swf");
$total = count($query);
for ($i = 0; $i < $total; $i++) {
    foreach ($exclude as $needle) {
        if (stristr($query[$i], $needle)) {
            $query[$i] = "EMPTY";
        }
    }
}
$query = array_values(array_unique($query));

// Step 3: fetch every remaining link, list it, and store its cleaned text.
$data = fopen('./data.txt', 'w');
$urls = fopen('./urls.txt', 'w');
if ($data === false || $urls === false) {
    die('Cannot open ./data.txt or ./urls.txt for writing');
}
print "<ol>";
$total = count($query);
for ($k = 0; $k < $total; $k++) {
    $link = trim($query[$k]);
    if ($link != "EMPTY") {
        // The link comes from a scraped page, so escape it before echoing it into HTML.
        print "<li>" . htmlspecialchars($link, ENT_QUOTES, 'Windows-1251') . chr(13) . chr(10) . "<br>";
        fwrite($urls, $link . "\r\n");
        $content = get_page($link);
        $text = CleanText($content);
        fwrite($data, $text);
    }
}
print "</ol>";
fclose($data);
fclose($urls);
print "Done!!! <a href=./data.txt target=_blank>Parsed text here</a>, <a href=./urls.txt target=_blank>parsed URLs here</a>...";
//------------------------------------------------
/**
 * Page-processing function: turn an HTML page into plain-text sentences.
 *
 * The markup is stripped, punctuation is normalised, the text is split into
 * one sentence per line, and only "clean" sentences are kept: 55 to 210
 * bytes long and free of the markers listed in $forbidden. As a side effect
 * the number of kept sentences is printed.
 *
 * @param string $content Raw HTML of a page.
 * @return string         Kept sentences joined with "\r\n".
 */
function CleanText($content)
{
    $text = $content;

    // --- 1. Remove markup -------------------------------------------------
    $text = preg_replace("/<title>\s*(.*?)\s*<\/title>/is", " ", $text);
    $text = preg_replace("/<!--.*?-->/s", " ", $text);
    $text = preg_replace("/<[Ss][Cc][Rr][Ii][Pp][Tt].*?<\/[Ss][Cc][Rr][Ii][Pp][Tt]>/s", " ", $text);
    $text = preg_replace("/<[Ss][Tt][Yy][Ll][Ee].*?<\/[Ss][Tt][Yy][Ll][Ee]>/s", " ", $text);
    $text = preg_replace("/<[^>]*>/s", " ", $text);
    $text = preg_replace('/\<!doctype[\w\W]*?\>/i', ' ', $text);
    $text = preg_replace('/\<style[\w\W]*?\<\/style\>/i', ' ', $text);
    // eregi_replace() no longer exists in PHP 7+; this is the same
    // case-insensitive pattern expressed with PCRE.
    $text = preg_replace('/ style="[^">]*"/i', ' ', $text);
    $text = strip_tags($text);
    $text = preg_replace('/\<script[\w\W]*?\<\/script\>/i', ' ', $text);

    // --- 2. Normalise whitespace and punctuation --------------------------
    $text = str_replace("&nbsp;", ' ', $text);
    // Every run of whitespace and/or commas becomes a single space.
    $text = preg_replace("/[\s,]+/", ' ', $text);

    // Applied strictly in this order; later pairs depend on earlier ones.
    $replacements = array(
        array("...", "."),
        array("..", "."),
        array("!!!", "!"),
        array("!!", "!"),
        array("???", "?"),
        array("??", "?"),
        array('»', '"'),
        array('«', '"'),
        // Every sentence terminator becomes ". + line break".
        array(".", ".\r\n"),
        array("!", ".\r\n"),
        array("?", ".\r\n"),
        array("|", ".\r\n"),
        array(".\r\n.\r\n", ".\r\n"),
        array(".\r\n.\r\n", ".\r\n"),
        array(". \r\n", ".\r\n"),
        array("\r\n\r\n", "\r\n"),
        array("\r\n\r\n", "\r\n"),
        array("\t\t", " "),
        array("\t", " "),
        // NOTE(review): search and replacement look identical in the next two
        // pairs; presumably they once collapsed double spaces - confirm.
        array(" ", " "),
        array(" ", " "),
        array(" .", "."),
        array(" ,", ","),
        array("- - - ", "- "),
        array("- - ", "- "),
        array("---", "-"),
        array("--", "-"),
        array("--", "-"),
        // NOTE(review): same remark as for the identical pairs above.
        array(" ", " "),
        array(" ", " "),
        array("--", "-"),
        array("--", "-"),
        array("***", "*"),
        array("**", "*"),
        // Strip stray punctuation left at the start of a line.
        array("\r\n?", "\r\n"),
        array("\r\n(", "\r\n"),
        array("\r\n)", "\r\n"),
        array("\r\n'", "\r\n"),
        array("\r\n-", "\r\n"),
        array("\r\n*", "\r\n"),
        array("\r\n?", "\r\n"),
        array("\r\n-", "\r\n"),
        array("\r\n ", "\r\n"),
        array("<", ""),
        array(">", ""),
    );
    foreach ($replacements as $pair) {
        $text = str_replace($pair[0], $pair[1], $text);
    }

    // --- 3. Split into sentences and filter -------------------------------
    $string = explode("\r\n", $text); // split into sentences
    // The final piece is whatever follows the last line break, i.e. text
    // with no sentence terminator after it; it is deliberately not examined.
    $string_num = sizeof($string) - 1;

    // A sentence containing any of these substrings is discarded.
    $forbidden = array(
        "[", ">", "http:", "www", "@", "&copy", "htm",
        "#8250", "#8249", "#8482", "&reg", "_", "<", "&",
    );

    $newstring = array();
    for ($k = 0; $k < $string_num; $k++) {
        $TempString = trim($string[$k]);
        $length = strlen($TempString);
        if ($length > 210 || $length < 55) {
            continue;
        }

        $isClean = true;
        foreach ($forbidden as $needle) {
            if (strpos($TempString, $needle) !== false) {
                $isClean = false;
                break;
            }
        }
        if ($isClean) {
            $newstring[] = $TempString;
        }
    }

    // Report the real number of kept lines (previously one less was shown).
    $string_total = sizeof($newstring);
    print 'Всего строк в тексте: '.$string_total.'<br>';

    return implode("\r\n", $newstring);
}
//------------------------------------------------
/**
 * Download a URL with cURL using a plain GET request.
 *
 * The URL itself is sent as the Referer, a fixed MSIE 6 User-Agent is used
 * and the request is made with HTTP/1.0.
 *
 * @param string $host Full URL to fetch (despite the name, not just a host).
 * @return string|false Response body without headers, or false on failure.
 */
function get_page($host){
    $ch = curl_init();
    if ($ch === false) {
        return false;
    }
    curl_setopt($ch, CURLOPT_URL, $host);
    curl_setopt($ch, CURLOPT_POST, 0);
    // No CURLOPT_COOKIE here: that option takes the cookie header text, so
    // passing 0 made every request carry a bogus "Cookie: 0" header.
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_REFERER, $host);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)");
    curl_setopt($ch, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_0);
    // Bound the transfer so one unresponsive site cannot stall the whole run.
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);
    $r = curl_exec($ch);
    curl_close($ch);
    return $r;
}
//------------------------------------------------
/**
 * Extract link targets from the saved search result pages.
 *
 * Reads ./result.txt and collects the href of every <a> element whose
 * target starts with "http://" (other schemes are not matched by the
 * pattern).
 *
 * @return array List of URLs in document order; an empty array when the
 *               file is missing/unreadable or contains no matching links.
 */
function ParseUrls()
{
    // Always return an array so callers can safely count() the result.
    $query = array();

    if (!is_readable('./result.txt')) {
        return $query;
    }
    $content = file_get_contents('./result.txt');
    if ($content === false) {
        return $query;
    }

    // Group 1 is the URL, group 2 the link text (unused).
    preg_match_all("|<a\s[^>]*?href\s*=\s*[\'\"]?(http://[^\s\'\">]+)[\s\'\"]?[^>]*?>(.+?)</a>|si", $content, $matches);
    foreach ($matches[1] as $href) {
        $query[] = trim($href);
    }

    return $query;
}
//------------------------------------------------
?>

Указываешь запросы и количество урлов, нужное для грабинга, и всё — результат пишется в файл.

KPOT_f!nd
26.02.2008, 03:52
А вот это намного лучше... А то подумал переписывать Isis'in скрипт : (

Isis
26.02.2008, 04:07
Не мой скрипт, а ламареза =\
Мой был бы с удобствами

KPOT_f!nd
26.02.2008, 04:44
Мне все равно чей, самое главное его выложили... Тему локните модеры (

Spyder
26.02.2008, 13:51
Вот мой парсер ICQ.COM, принцип работы поисковика тот же что и у гугла

https://forum.antichat.ru/thread56927.html