Форум АНТИЧАТ - Показать сообщение отдельно

Цитата:

Сообщение от Шниперсон

↑
хотя бы грабить урлы отсюда, раз в минуту
http://pastebin.com/archive

Вот вариант на Python: он раз в минуту собирает ссылки со страницы архива и сохраняет содержимое в текстовые файлы в указанном каталоге.

Код:

# -*- coding: utf-8 -*-

import socks
import socket
import requests
import sys
import re
import os
import time

def custom_function(url, dir):
    """Download *url* and save the response body as a ``.txt`` file in *dir*.

    The file name is the last path segment of *url* (a single trailing
    slash is ignored) with ``.txt`` appended. The body is written encoded
    as UTF-8. *dir* is created if it does not exist yet.

    All failures are reported with ``print`` rather than raised, so a
    polling caller can keep going after a single bad download.

    Args:
        url: Address of the document to fetch.
        dir: Directory the file is written to.

    Returns:
        True if the file was written; None on any failure (request error,
        HTTP error status, no usable file name in *url*, or a file-system
        error).
    """
    ext = 'txt'
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    # Broad catch on purpose: this is a best-effort boundary and any failure
    # while fetching simply means "nothing to save".
    try:
        r = requests.get(url, headers=headers, timeout=15)
        # Without this an HTTP error page (404, 5xx, ...) would be stored
        # on disk as if it were the requested content.
        r.raise_for_status()
        html = r.text
    except Exception as e:
        print(e)
        return None

    if url.endswith('/'):
        url = url[:-1]
    pos = url.rfind('/')
    name = url[pos + 1:]
    # An empty last segment would otherwise produce a file called ".txt".
    if pos == -1 or not name:
        print("Error getting file name from url")
        return None

    file_name = name + '.' + ext
    print('Saving to ' + file_name)
    try:
        # makedirs also handles nested paths; an empty dir means the
        # current directory and needs no creation.
        if dir and not os.path.isdir(dir):
            os.makedirs(dir)
        # "with" guarantees the handle is closed even if the write fails.
        with open(os.path.join(dir, file_name), 'wb') as f:
            f.write(html.encode("utf-8"))
    except (IOError, OSError, UnicodeError) as e:
        print(e)
        return None

    return True
      
url = 'http://pastebin.com/archive'
raw_pre_url = 'http://pastebin.com/raw/'
dir = 'saved'
sleep_time = 60
# Uncomment both lines to send all traffic through a SOCKS5 proxy
# listening on localhost:9150.
#socks.set_default_proxy(socks.SOCKS5, "localhost", 9150)
#socket.socket = socks.socksocket
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

# NOTE(review): the pattern as posted was 'class="i_p0" alt="" />Untitled',
# with no capture group, so findall() returned that literal text and the
# resulting raw URL was never valid. The HTML tags were presumably stripped
# when the code was pasted; the link part below is a reconstruction and
# must be checked against the current markup of the archive page.
# Compiled once here instead of on every polling cycle.
regex = re.compile(r'class="i_p0" alt="" /><a href="/(\w+)">Untitled')


def main():
    """Poll the archive page forever and save every newly listed paste.

    Each cycle fetches ``url``, extracts paste ids with ``regex``,
    downloads the raw version of each id not saved before via
    ``custom_function`` and then sleeps for ``sleep_time`` seconds.
    The process exits if the archive page itself cannot be fetched.
    """
    # Ids already written during this run, so the same paste is not
    # downloaded again on every cycle while it stays on the archive page.
    saved_ids = set()
    while True:
        try:
            r = requests.get(url, headers=headers, timeout=10)
        except Exception as e:
            print(e)
            print('Exiting')
            sys.exit()
        for paste_id in regex.findall(r.text):
            if paste_id in saved_ids:
                continue
            # Only remember successful saves so failures are retried
            # on the next cycle.
            if custom_function(raw_pre_url + paste_id, dir):
                saved_ids.add(paste_id)
        print('Sleeping for ' + str(sleep_time) + ' seconds ...')
        time.sleep(sleep_time)


# Guarded so that importing this file does not start the endless loop.
if __name__ == '__main__':
    main()

Нужно ещё будет установить модули requests и PySocks командой: pip install имя_модуля.