• Python crawler program
    Date: 2008-12-19   Author: limodou   Source: the Internet

    Here is a repost of a crawler written by limodou. Not much more to say; just read the code. It consists of two files:
    #coding=cp936
    #$Id: ReceiveURL.py,v 1.4 2004/03/27 13:09:48 liyinghui Exp $
    
    import sys
    from sgmllib import SGMLParser
    from os.path import normpath, normcase, join, split, splitext, isabs, exists
    
    class ReceiveURL(SGMLParser):
        def reset(self):
            SGMLParser.reset(self)
            self.url=[]
    
        def start_a(self, attrs):
            for attr, value in attrs:
                if attr.lower()=="href":    # it's a link: strip any anchor or query string, then collect it
                    href=value
    
                    if href.find("#")>0:
                        href=href[:href.index("#")]
                    if href.find("?")>0:
                        href=href[:href.index("?")]
                    file,ext=splitext(href)
    
                    if self.fileexts:
                        if ext:
                            if ext in self.fileexts:    # extension is in the allowed list: collect the link for later processing
                                self.addURL(href)
                            else:
                                self.logger.info('Skip extension [%s]' % ext)
                    else:
                        self.addURL(href)
    
        def start_link(self, attrs):
            for attr, value in attrs:
                if attr.lower()=="href":
                    href=value
                    self.addURL(href)
    
        def start_frame(self, attrs):
            for attr, value in attrs:
                if attr.lower()=="src":
                    href=value
                    self.addURL(href)
    
        def start_img(self, attrs):
            for attr, value in attrs:
                if attr.lower()=="src":
                    href=value
                    self.addURL(href)
    
        def addURL(self, url):
            if url not in self.url:
                self.url += [url]
    
        def output(self):
            print 'total=', len(self.url)
            for u in self.url:
                print '['+u+']'
    
        def run(self, filename, fileexts, htmlexts, logger):
            self.fileexts=fileexts #fileexts is the list of downloadable file extensions
            self.logger = logger
            fname, ext = splitext(filename)
            if ext in htmlexts:
                text=open(filename).read()
                self.feed(text)
                self.close()
    
        def getURLs(self):
            return self.url
    
    if __name__=='__main__':
        import logging
        logging.basicConfig()
        r=ReceiveURL()
        # collect every link (empty fileexts list) from an .htm/.html file named on the command line
        r.run(sys.argv[1], [], ['.htm', '.html'], logging.getLogger())
        r.output()
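
    The parser can also be fed an HTML string directly. A minimal sketch of my own (not part of the original post); note that fileexts and logger have to be set by hand here, because normally only run() assigns them:

    #coding=cp936
    import logging
    from ReceiveURL import ReceiveURL

    logging.basicConfig()
    r = ReceiveURL()
    r.fileexts = []                 # an empty list means: collect every <a> href
    r.logger = logging.getLogger()
    r.feed('<a href="a.html#top">A</a> <img src="logo.png"> <a href="mailto:x@y">m</a>')
    r.close()
    print r.getURLs()               # ['a.html', 'logo.png', 'mailto:x@y']

    The anchor part (#top) is stripped by start_a, while mailto: links pass through here; they are only filtered later, in Crawler.getPage().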
    
    
    --------------------------------------
    
    #!/usr/bin/env python
    #coding=cp936
    
    """CRAWL V2.0
    copyright (c) limodou(chatme at 263.net)
    
    This is free software, distributed under the terms of the GPL.
    You can use it to grab HTML documents, beginning with a specified URL.
    When you give it a URL, it first grabs that page, parses all the links
    in the page, and then grabs them all. Enjoy it!
    
    $Id: CRAWL.PY,v 1.2 2004/03/25 05:28:33 liyinghui Exp $
    """
    
    from sys        import argv
    from os         import makedirs, unlink
    from os.path    import isdir, exists, dirname, splitext
    from string     import replace, find, lower
    from htmllib    import HTMLParser
    from urllib     import urlretrieve
    from urlparse   import urlparse, urljoin
    from formatter  import DumbWriter, AbstractFormatter
    from cStringIO  import StringIO
    import urllib2, getopt, sys, os, urllib
    from ReceiveURL import ReceiveURL
    import threading
    import time
    import traceback
    import ConfigParser
    import datetime
    
    proxyflag = 0
    seenfile = 'seen.txt'
    downfile = 'down.txt'
    inifile = '.crawl.ini'
    logfile = 'crawl.log'
    logger = None
    tflag = False
    
    class Retriever:                 # download Web pages
    
        def __init__(self, url, fileexts, htmlexts):
            self.url = url
            self.fileexts = fileexts
            self.htmlexts = htmlexts
            self.file = self.filename(url)
            self.r = ReceiveURL()
    
        def filename(self, url, deffile='index.html'):
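            # Illustrative mapping (hypothetical URL):
            #   http://example.com/docs/?id=3  ->  example.com/docs/index.html_id=3.htm
            # i.e. host + path, with deffile appended for directory-style URLs, and
            # any params/query appended with '_' plus a final '.htm'.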
            parsedurl = urlparse(url, 'http:', 0)  # parse path
            path = parsedurl[1] + parsedurl[2]
            ext = splitext(path)
            if ext[1] == '':
                if path[-1] == '/':
                    path = path + deffile
                else:
                    path = path + '/' + deffile
            dir = dirname(path)
            if not isdir(dir):       # create archive dir if nec.
                if exists(dir): unlink(dir)
                try:
                    makedirs(dir)
                except:
                    pass
            flag=0
            if parsedurl[3]:
                path += '_'+parsedurl[3]
                flag=1
            if parsedurl[4]:
                path += '_'+parsedurl[4]
                flag=1
            if flag:
                path += '.htm'
            return path
    
        def download(self):          # download Web page
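            # If a proxy was given on the command line, proxyflag is set and the
            # request goes through urllib2 (which honors the installed ProxyHandler);
            # otherwise plain urllib is used. The response body is written to self.file.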
            try:
    #            retval = urllib.urlretrieve(self.url, self.file)
    #add
                if proxyflag:
                    f=urllib2.urlopen(self.url)
                else:
                    f=urllib.urlopen(self.url)
                open(self.file, 'wb').write(f.read())
                retval=self.url, f.headers
    #add end
            except Exception, e:
                logger.error(str(e))
                retval = ('*** ERROR: invalid URL "%s"' % self.url, )
            return retval
    
        def parseAndGetLinks(self):  # parse HTML, save links
            self.r.run(self.file, self.fileexts, self.htmlexts, logger)
            return self.r.getURLs()
    
    class Crawler:                   # manage entire crawling process
    
        count = 0                    # static downloaded page counter
    
        def __init__(self, url, seen, exts, htmlexts):     # url is a list of not-yet-downloaded URLs, seen is a list of already-downloaded URLs, exts is the list of downloadable file extensions
            self.q=url[:]
            self.seen = seen[:]
            self.exts = exts
            self.htmlexts = htmlexts
            self.lock = threading.Lock()
            parse=urlparse(url[0])
            self.dom = parse[1]
            self.basepath = parse[0]+'://'+parse[1]+dirname(parse[2])        # start path, everything inside the dir will be grabbed
            print 'Starting URL is: %s\n' % self.basepath
    
        def addDownLoadedURL(self, url):
            """加入已经下载完的url"""
            self.lock.acquire()
            self.seen.append(url)
            Crawler.count = Crawler.count + 1
            open(seenfile, "w").write("\n".join(self.seen))
            self.lock.release()
    
        def getDownloadURL(self):
            self.lock.acquire()
            if len(self.q) > 0:
                url = self.q[0]
                self.q.remove(url)
                open(downfile, "w").write("\n".join(self.q))
            else:
                url = ''
            self.lock.release()
            return url
            
        def getPage(self, url):
            self.addDownLoadedURL(url)
            r = Retriever(url, self.exts, self.htmlexts)
            if url.startswith(self.basepath):
                print threading.currentThread().getName(), 'GETTING ~'+url[len(self.basepath):]
            else:
                print threading.currentThread().getName(), 'GETTING '+url
            retval = r.download()
            if retval[0][0] == '*':     # error situation, do not parse
                print '    >>>> ERROR: skipping parse'
                return
            #print '\n(', Crawler.count, ')'
            #print 'URL:', url
            #print 'FILE:', retval[0]
    
            links = r.parseAndGetLinks()  # get and process links
            self.lock.acquire()
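            # Filtering policy (summarizing the checks below): mailto: links are
            # dropped; relative links are resolved against the current page; a link
            # is queued only if it stays on the same host under the starting path
            # and is not already in seen (downloaded) or in q (queued).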
            for eachLink in links:
                if eachLink[:4] != 'http' and find(eachLink, '://') == -1:
                    eachLink = urljoin(url, eachLink).split('#')[0]
                    path = dirname(eachLink)
                else:
                    path = dirname(eachLink)
    
                if find(lower(eachLink), 'mailto:') != -1:
                    #print '... discarded, mailto link'
                    continue
    
                if eachLink not in self.seen:
                    if find(eachLink, self.dom) == -1 or not path.startswith(self.basepath):
                        #print '... discarded, not in domain or not under starting path'
                        pass
                    else:
                        if not eachLink in self.q:
                            self.q.append(eachLink)
                            #print '... new, added to Q'
                        else:
                            #print '... discarded, already in Q'
                            pass
                else:
                    #print '... discarded, already processed'
                    pass
            open(downfile, "w").write("\n".join(self.q))
            self.lock.release()
    
        def go(self, threadnum):                # process links in queue
            global tflag
            starttime = datetime.datetime.now()
            threads = []
            for i in range(threadnum):
                t = MyThread(self)
                t.setDaemon(True)
                threads.append(t)
            for i in range(threadnum):
                threads[i].start()
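            # The main thread only polls: once the queue is empty and no worker is
            # marked active, it breaks out of the loop, sets tflag so the workers
            # exit, and reports the totals.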
            while 1:
                try:
                    if len(self.q) > 0:
                        time.sleep(0.1)
                        continue
                    f = False
                    for i in range(threadnum):
                        if threads[i].active:
                            f = True
                            break
                    if f:
                        time.sleep(0.1)
                        continue
                    break
    #            url = self.q[0]
    #            self.q.remove(url)
    #            open(downfile, "w").write("\n".join(self.q))
    #            self.getPage(url)
                except:
                    traceback.print_exc()
                    break
            tflag = True
            endtime = datetime.datetime.now()
            print "Retrieved total %d files in %d seconds." % (Crawler.count, (endtime - starttime).seconds)
        
    class MyThread(threading.Thread):
        def __init__(self, robot):
            self.active = False
            self.robot = robot
            threading.Thread.__init__(self)
            
        def run(self):
            while not tflag:
                url = self.robot.getDownloadURL()
                if url:
                    self.active = True
                    self.robot.getPage(url)
                    self.active = False
                else:
                    time.sleep(0.1)
    
    def initlog():
        import logging
        global logger
        
        logger = logging.getLogger()
        hdlr = logging.FileHandler(logfile)
        formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
        hdlr.setFormatter(formatter)
        logger.addHandler(hdlr)
        logger.setLevel(logging.NOTSET)
        
        return logger
    
    def usage():
        print '''
    CRAWL V2.0
    copyright (c) limodou(chatme at 263.net)
    
    This is free software, distributed under the terms of the GPL.
    You can use it to grab HTML documents, beginning with a specified URL.
    When you give it a URL, it first grabs that page, parses all the links
    in the page, and then grabs them all. Enjoy it!
    
    Command line usage:
    
            python crawl.py [-p proxy|-t num|-f configfile] [url|-r]
        -p proxy        proxy = http://[username:password@]hostname:port
        -t num          number of worker threads (default 10)
        -f configfile   config file to read (default: .crawl.ini)
        -r              resume from the down.txt/seen.txt lists left by a previous run

    or      python crawl.py -u      (show this help)
    '''
    
    def main():
        global proxyflag
        global inifile
        try:
            opts, args = getopt.getopt(sys.argv[1:], "p:urt:f:", [])
        except getopt.GetoptError:
            usage()
            sys.exit(2)
        proxyhost=''
        resume=0
        seen=[]
        url=[]
        threadnum = 10
        for o, a in opts:
            if o == '-u':
                usage()
                sys.exit()
            elif o == '-p':
                proxyhost=a
                proxyflag=1
            elif o == '-r':
                url=[u.strip() for u in open(downfile).readlines() if u.strip()]
                seen=[u.strip() for u in open(seenfile).readlines() if u.strip()]
            elif o == '-t':
                try:
                    threadnum = int(a)
                except:
                    threadnum = 10
                if threadnum == 0:
                    threadnum = 10
            elif o == '-f':
                inifile = a
    
        #args[0] = "http://localhost:8088/index.html"
        if len(args) > 0:
            url = [args[0]]
        else:
            if not url:
                try:
                    u = raw_input('Enter starting URL: ')
                    url = [u]
                except (KeyboardInterrupt, EOFError):
                    url = ''
        if proxyhost:
    #        proxy=urllib2.ProxyHandler({'http':'http://www:www@11.133.232.19:8080'})
            print "\nProxy is: %s" % proxyhost
            proxy=urllib2.ProxyHandler({'http':proxyhost})
            opener=urllib2.build_opener(proxy)
            urllib2.install_opener(opener)
    
        if not url: return
    
        ini = ConfigParser.ConfigParser()
        ini.read(inifile)
        
        exts = []
        if ini.has_option('default', 'exts'):
            exts = ini.get('default', 'exts').split()
        if not exts:
            exts = ['.htm', '.html', '.gif', '.jpg', '.png', '.py', '.txt', '.css', '.js', '.aspx']
    
        htmlexts = []
        if ini.has_option('default', 'htmlexts'):
            htmlexts = ini.get('default', 'htmlexts').split()
        if not htmlexts:
            htmlexts = ['.htm', '.html']
        
        logger = initlog()
        
        robot = Crawler(url, seen, exts, htmlexts)
        robot.go(threadnum)
    
    if __name__ == '__main__':
        main()
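
    For reference, here is a minimal .crawl.ini sketch of my own (the [default] section and the option names follow the ConfigParser calls in main(); the extension values are just placeholders, and both options are whitespace-separated lists):

    [default]
    exts = .htm .html .gif .jpg .png .css .js
    htmlexts = .htm .html

    And an example invocation (the proxy address and URL are placeholders):

    python crawl.py -t 5 -p http://user:pass@proxy.example.com:8080 http://example.com/index.html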
