from query import CrawlerDb
from content_processor import ContentProcessor
- from settings import VERBOSE, COLOR_ERROR, COLOR_SUCCESS
- import sys, urlparse, urllib2
+ from settings import VERBOSE, USE_COLORS, DATABASE_ENGINE, DATABASE_NAME, SQLITE_ROTATE_DATABASE_ON_STARTUP
+ import sys, urlparse, urllib2, shutil, glob, robotparser
import cPrinter

# ===== Init stuff =====
processor = ContentProcessor(None, None, None)

# get cprinter
- printer = cPrinter.Printer(COLOR_SUCCESS, COLOR_ERROR)
+ printer = cPrinter.Printer(USE_COLORS)
+
+ # robot parser init
+ robot = robotparser.RobotFileParser()

if len(sys.argv) < 2:
-     printer.p("Error: No start url was passed", printer.error)
+     printer.p("Error: No start url was passed", printer.other)
    sys.exit()

l = sys.argv[1:]

cdb.enqueue(l)

def crawl():
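+     # main crawl loop: dequeue a url, fetch it, process the page, and queue new links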
-     printer.p("starting...", printer.success)
-     queue_empty = False
+     printer.p("starting...", printer.other)
    while True:
        url = cdb.dequeue()
+         if url is False:
+             break
        if cdb.checkCrawled(url):
            continue
+         if not url.startswith('http'):
+             printer.p("Unfollowable link found at %s" % url, printer.other)
+             continue
+
+         # honour robots.txt before fetching the page
+         u = urlparse.urlparse(url)
+         robot.set_url('http://' + u[1] + "/robots.txt")
+         if not robot.can_fetch('PyCrawler', url):
+             printer.p("Url disallowed by robots.txt: %s" % url, printer.other)
+             continue
-         if url is False:
-             queue_empty = True
        status = 0
        request = None
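+         # fetch the url, recording the http status of the response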
        try:
            request = urllib2.urlopen(str(url))
+         # HTTPError is a subclass of URLError, so it has to be caught first
+         except urllib2.HTTPError, e:
+             status = e.code
+             # the HTTPError object doubles as the response, keep it so read() works
+             request = e
        except urllib2.URLError, e:
            printer.p(e.reason, printer.error)
+             printer.p("Exception at url: %s" % url, printer.error)
+             continue
-         except urllib2.HTTPError, e:
-             status = e.code
        if status == 0:
            status = 200
        data = request.read()
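+         # hand the page to the content processor and collect its outbound links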
+         processor.setInfo(str(url), status, data)
+         ret = processor.process()
+         if status != 200:
+             continue
+         # only queue links that have not been crawled yet
+         add_queue = []
+         for q in ret:
+             if not cdb.checkCrawled(q):
+                 add_queue.append(q)

-         processor.setInfo(str(url), status, data)
-         add_queue = processor.process()
@@ -52,17 +74,26 @@ def crawl():
        printer.p("Got %s status from %s" % (status, url), printer.success)
        printer.p("Found %i links" % l, printer.success)
        if l > 0:
-             if queue_empty == True:
-                 queue_empty = False
            cdb.enqueue(add_queue)
        cdb.addPage(processor.getDataDict())
        processor.reset()
-         if queue_empty:
-             break

-     printer.p("finishing...", printer.success)
+     printer.p("finishing...", printer.other)
    cdb.close()
    printer.p("done! goodbye!", printer.success)

if __name__ == "__main__":
-     crawl()
+     if DATABASE_ENGINE == "sqlite" and SQLITE_ROTATE_DATABASE_ON_STARTUP:
+         # back up the current sqlite database under the next free .db.N suffix
+         dbs = glob.glob("*.db*")
+         index = 1
+         while ("%s.db.%s" % (DATABASE_NAME, index)) in dbs:
+             index += 1
+         if dbs:
+             shutil.copy2(dbs[len(dbs) - 1], "%s.db.%s" % (DATABASE_NAME, index))
+     try:
+         crawl()
+     except KeyboardInterrupt:
+         printer.p("Stopping", printer.error)
+         sys.exit()
+     except Exception, e:
+         printer.p("EXCEPTION: %s" % e, printer.error)