• Python crawler program
    Date: 2008-12-19   Author: limodou   Source: the Internet

    Here is a repost of a crawler written by limodou. Not much more to say; just read the code. It consists of two files:
    #coding=cp936
    #$Id: ReceiveURL.py,v 1.4 2004/03/27 13:09:48 liyinghui Exp $
    
    import sys
    from sgmllib import SGMLParser
    from os.path import normpath, normcase, join, split, splitext, isabs, exists
    
    class ReceiveURL(SGMLParser):
        def reset(self):
            SGMLParser.reset(self)
            self.url=[]
    
        def start_a(self, attrs):
            for attr, value in attrs:
                if attr.lower()=="href":    # it's a link: strip any anchor or query string, then collect it
                    href=value
    
                    if href.find("#")>0:
                        href=href[:href.index("#")]
                    if href.find("?")>0:
                        href=href[:href.index("?")]
                    file,ext=splitext(href)
    
                    if self.fileexts:
                        if ext:
                            if ext in self.fileexts:    # extension is in the allowed list: collect the link for later processing
                                self.addURL(href)
                            else:
                                self.logger.info('Skip extension [%s]' % ext)
                    else:
                        self.addURL(href)
    
        def start_link(self, attrs):
            for attr, value in attrs:
                if attr.lower()=="href":
                    href=value
                    self.addURL(href)
    
        def start_frame(self, attrs):
            for attr, value in attrs:
                if attr.lower()=="src":
                    href=value
                    self.addURL(href)
    
        def start_img(self, attrs):
            for attr, value in attrs:
                if attr.lower()=="src":
                    href=value
                    self.addURL(href)
    
        def addURL(self, url):
            if url not in self.url:
                self.url += [url]
    
        def output(self):
            print 'total=', len(self.url)
            for u in self.url:
                print '['+u+']'
    
        def run(self, filename, fileexts, htmlexts, logger):
            self.fileexts=fileexts #fileexts is the list of downloadable file extensions
            self.logger = logger
            fname, ext = splitext(filename)
            if ext in htmlexts:
                text=open(filename).read()
                self.feed(text)
                self.close()
    
        def getURLs(self):
            return self.url
    
    if __name__=='__main__':
        import logging
        logging.basicConfig()
        r=ReceiveURL()
        # collect every link (empty fileexts list) from an .htm/.html file named on the command line
        r.run(sys.argv[1], [], ['.htm', '.html'], logging.getLogger())
        r.output()
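
    The parser can also be fed an HTML string directly. A minimal sketch of my own (not part of the original post); note that fileexts and logger have to be set by hand here, because normally only run() assigns them:

    #coding=cp936
    import logging
    from ReceiveURL import ReceiveURL

    logging.basicConfig()
    r = ReceiveURL()
    r.fileexts = []                 # an empty list means: collect every <a> href
    r.logger = logging.getLogger()
    r.feed('<a href="a.html#top">A</a> <img src="logo.png"> <a href="mailto:x@y">m</a>')
    r.close()
    print r.getURLs()               # ['a.html', 'logo.png', 'mailto:x@y']

    The anchor part (#top) is stripped by start_a, while mailto: links pass through here; they are only filtered later, in Crawler.getPage().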
    
    
    --------------------------------------
    
    #!/usr/bin/env python
    #coding=cp936
    
    """CRAWL V2.0
    copyright (c) limodou(chatme at 263.net)
    
    This is free software, distributed under the terms of the GPL.
    You can use it to grab HTML documents, beginning with a specified URL.
    When you give it a URL, it first grabs that page, parses all the links
    in the page, and then grabs them all. Enjoy it!
    
    $Id: CRAWL.PY,v 1.2 2004/03/25 05:28:33 liyinghui Exp $
    """
    
    from sys        import argv
    from os         import makedirs, unlink
    from os.path    import isdir, exists, dirname, splitext
    from string     import replace, find, lower
    from htmllib    import HTMLParser
    from urllib     import urlretrieve
    from urlparse   import urlparse, urljoin
    from formatter  import DumbWriter, AbstractFormatter
    from cStringIO  import StringIO
    import urllib2, getopt, sys, os, urllib
    from ReceiveURL import ReceiveURL
    import threading
    import time
    import traceback
    import ConfigParser
    import datetime
    
    proxyflag = 0
    seenfile = 'seen.txt'
    downfile = 'down.txt'
    inifile = '.crawl.ini'
    logfile = 'crawl.log'
    logger = None
    tflag = False
    
    class Retriever:                 # download Web pages
    
        def __init__(self, url, fileexts, htmlexts):
            self.url = url
            self.fileexts = fileexts
            self.htmlexts = htmlexts
            self.file = self.filename(url)
            self.r = ReceiveURL()
    
        def filename(self, url, deffile='index.html'):
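            # Illustrative mapping (hypothetical URL):
            #   http://example.com/docs/?id=3  ->  example.com/docs/index.html_id=3.htm
            # i.e. host + path, with deffile appended for directory-style URLs, and
            # any params/query appended with '_' plus a final '.htm'.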
            parsedurl = urlparse(url, 'http:', 0)  # parse path
            path = parsedurl[1] + parsedurl[2]
            ext = splitext(path)
            if ext[1] == '':
                if path[-1] == '/':
                    path = path + deffile
                else:
                    path = path + '/' + deffile
            dir = dirname(path)
            if not isdir(dir):       # create archive dir if nec.
                if exists(dir): unlink(dir)
                try:
                    makedirs(dir)
                except:
                    pass
            flag=0
            if parsedurl[3]:
                path += '_'+parsedurl[3]
                flag=1
            if parsedurl[4]:
                path += '_'+parsedurl[4]
                flag=1
            if flag:
                path += '.htm'
            return path
    
        def download(self):          # download Web page
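            # If a proxy was given on the command line, proxyflag is set and the
            # request goes through urllib2 (which honors the installed ProxyHandler);
            # otherwise plain urllib is used. The response body is written to self.file.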
            try:
    #            retval = urllib.urlretrieve(self.url, self.file)
    #add
                if proxyflag:
                    f=urllib2.urlopen(self.url)
                else:
                    f=urllib.urlopen(self.url)
                open(self.file, 'wb').write(f.read())
                retval=self.url, f.headers
    #add end
            except Exception, e:
                logger.error(str(e))
                retval = ('*** ERROR: invalid URL "%s"' % self.url, )
            return retval
    
        def parseAndGetLinks(self):  # parse HTML, save links
            self.r.run(self.file, self.fileexts, self.htmlexts, logger)
            return self.r.getURLs()
    
    class Crawler:                   # manage entire crawling process
    
        count = 0                    # static downloaded page counter
    
        def __init__(self, url, seen, exts, htmlexts):     # url is a list of not-yet-downloaded URLs, seen is a list of already-downloaded URLs, exts is the list of downloadable file extensions
            self.q=url[:]
            self.seen = seen[:]
            self.exts = exts
            self.htmlexts = htmlexts
            self.lock = threading.Lock()
            parse=urlparse(url[0])
            self.dom = parse[1]
            self.basepath = parse[0]+'://'+parse[1]+dirname(parse[2])        # start path, everything inside the dir will be grabbed
            print 'Starting URL is: %s\n' % self.basepath
    
        def addDownLoadedURL(self, url):
            """加入已经下载完的url"""
            self.lock.acquire()
            self.seen.append(url)
            Crawler.count = Crawler.count + 1
            open(seenfile, "w").write("\n".join(self.seen))
            self.lock.release()
    
        def getDownloadURL(self):
            self.lock.acquire()
            if len(self.q) > 0:
                url = self.q[0]
                self.q.remove(url)
                open(downfile, "w").write("\n".join(self.q))
            else:
                url = ''
            self.lock.release()
            return url
            
        def getPage(self, url):
            self.addDownLoadedURL(url)
            r = Retriever(url, self.exts, self.htmlexts)
            if url.startswith(self.basepath):
                print threading.currentThread().getName(), 'GETTING ~'+url[len(self.basepath):]
            else:
                print threading.currentThread().getName(), 'GETTING '+url
            retval = r.download()
            if retval[0][0] == '*':     # error situation, do not parse
                print '    >>>> ERROR: skipping parse'
                return
            #print '\n(', Crawler.count, ')'
            #print 'URL:', url
            #print 'FILE:', retval[0]
    
            links = r.parseAndGetLinks()  # get and process links
            self.lock.acquire()
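            # Filtering policy (summarizing the checks below): mailto: links are
            # dropped; relative links are resolved against the current page; a link
            # is queued only if it stays on the same host under the starting path
            # and is not already in seen (downloaded) or in q (queued).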
            for eachLink in links:
                if eachLink[:4] != 'http' and find(eachLink, '://') == -1:
                    eachLink = urljoin(url, eachLink).split('#')[0]
                    path = dirname(eachLink)
                else:
                    path = dirname(eachLink)
    
                if find(lower(eachLink), 'mailto:') != -1:
                    #print '... discarded, mailto link'
                    continue
    
                if eachLink not in self.seen:
                    if find(eachLink, self.dom) == -1 or not path.startswith(self.basepath):
                        #print '... discarded, not in domain or not under starting path'
                        pass
                    else:
                        if not eachLink in self.q:
                            self.q.append(eachLink)
                            #print '... new, added to Q'
                        else:
                            #print '... discarded, already in Q'
                            pass
                else:
                    #print '... discarded, already processed'
                    pass
            open(downfile, "w").write("\n".join(self.q))
            self.lock.release()
    
        def go(self, threadnum):                # process links in queue
            global tflag
            starttime = datetime.datetime.now()
            threads = []
            for i in range(threadnum):
                t = MyThread(self)
                t.setDaemon(True)
                threads.append(t)
            for i in range(threadnum):
                threads[i].start()
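            # The main thread only polls: once the queue is empty and no worker is
            # marked active, it breaks out of the loop, sets tflag so the workers
            # exit, and reports the totals.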
            while 1:
                try:
                    if len(self.q) > 0:
                        time.sleep(0.1)
                        continue
                    f = False
                    for i in range(threadnum):
                        if threads[i].active:
                            f = True
                            break
                    if f:
                        time.sleep(0.1)
                        continue
                    break
    #            url = self.q[0]
    #            self.q.remove(url)
    #            open(downfile, "w").write("\n".join(self.q))
    #            self.getPage(url)
                except:
                    traceback.print_exc()
                    break
            tflag = True
            endtime = datetime.datetime.now()
            print "Retrieved total %d files in %d seconds." % (Crawler.count, (endtime - starttime).seconds)
        
    class MyThread(threading.Thread):
        def __init__(self, robot):
            self.active = False
            self.robot = robot
            threading.Thread.__init__(self)
            
        def run(self):
            while not tflag:
                url = self.robot.getDownloadURL()
                if url:
                    self.active = True
                    self.robot.getPage(url)
                    self.active = False
                else:
                    time.sleep(0.1)
    
    def initlog():
        import logging
        global logger
        
        logger = logging.getLogger()
        hdlr = logging.FileHandler(logfile)
        formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
        hdlr.setFormatter(formatter)
        logger.addHandler(hdlr)
        logger.setLevel(logging.NOTSET)
        
        return logger
    
    def usage():
        print '''
    CRAWL V2.0
    copyright (c) limodou(chatme at 263.net)
    
    This is free software, distributed under the terms of the GPL.
    You can use it to grab HTML documents, beginning with a specified URL.
    When you give it a URL, it first grabs that page, parses all the links
    in the page, and then grabs them all. Enjoy it!
    
    Command line usage:
    
            python crawl.py [-p proxy|-t num|-f configfile] [url|-r]
        -p proxy        proxy = http://[username:password@]hostname:port
        -t num          number of worker threads (default 10)
        -f configfile   config file to read (default: .crawl.ini)
        -r              resume from the down.txt/seen.txt lists left by a previous run

    or      python crawl.py -u      (show this help)
    '''
    
    def main():
        global proxyflag
        global inifile
        try:
            opts, args = getopt.getopt(sys.argv[1:], "p:urt:f:", [])
        except getopt.GetoptError:
            usage()
            sys.exit(2)
        proxyhost=''
        resume=0
        seen=[]
        url=[]
        threadnum = 10
        for o, a in opts:
            if o == '-u':
                usage()
                sys.exit()
            elif o == '-p':
                proxyhost=a
                proxyflag=1
            elif o == '-r':
                url=[u.strip() for u in open(downfile).readlines() if u.strip()]
                seen=[u.strip() for u in open(seenfile).readlines() if u.strip()]
            elif o == '-t':
                try:
                    threadnum = int(a)
                except:
                    threadnum = 10
                if threadnum == 0:
                    threadnum = 10
            elif o == '-f':
                inifile = a
    
        #args[0] = "http://localhost:8088/index.html"
        if len(args) > 0:
            url = [args[0]]
        else:
            if not url:
                try:
                    u = raw_input('Enter starting URL: ')
                    url = [u]
                except (KeyboardInterrupt, EOFError):
                    url = ''
        if proxyhost:
    #        proxy=urllib2.ProxyHandler({'http':'http://www:www@11.133.232.19:8080'})
            print "\nProxy is: %s" % proxyhost
            proxy=urllib2.ProxyHandler({'http':proxyhost})
            opener=urllib2.build_opener(proxy)
            urllib2.install_opener(opener)
    
        if not url: return
    
        ini = ConfigParser.ConfigParser()
        ini.read(inifile)
        
        exts = []
        if ini.has_option('default', 'exts'):
            exts = ini.get('default', 'exts').split()
        if not exts:
            exts = ['.htm', '.html', '.gif', '.jpg', '.png', '.py', '.txt', '.css', '.js', '.aspx']
    
        htmlexts = []
        if ini.has_option('default', 'htmlexts'):
            htmlexts = ini.get('default', 'htmlexts').split()
        if not htmlexts:
            htmlexts = ['.htm', '.html']
        
        logger = initlog()
        
        robot = Crawler(url, seen, exts, htmlexts)
        robot.go(threadnum)
    
    if __name__ == '__main__':
        main()
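
    For reference, here is a minimal .crawl.ini sketch of my own (the [default] section and the option names follow the ConfigParser calls in main(); the extension values are just placeholders, and both options are whitespace-separated lists):

    [default]
    exts = .htm .html .gif .jpg .png .css .js
    htmlexts = .htm .html

    And an example invocation (the proxy address and URL are placeholders):

    python crawl.py -t 5 -p http://user:pass@proxy.example.com:8080 http://example.com/index.html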
