- #!/usr/bin/python
- import sys
- import re
- import urllib2
- import urlparse
- import threading
- import sqlite3 as sqlite
- import robotparser
- # Try to import psyco for JIT compilation
+ from query import CrawlerDb
+ from content_processor import ContentProcessor
+ from settings import VERBOSE
+ import sys, urlparse, urllib2

+ # ===== Init stuff =====

- """
- The program should take arguments
- 1) database file name
- 2) start url
- 3) crawl depth
- 4) domains to limit to, regex (optional)
- 5) verbose (optional)
- Start out by checking to see if the args are there and
- set them to their variables
- """
- if len(sys.argv) < 4:
-     sys.exit("Not enough arguments!")
- else:
-     dbname = sys.argv[1]
-     starturl = sys.argv[2]
-     crawldepth = int(sys.argv[3])
-     if len(sys.argv) >= 5:
-         domains = sys.argv[4]
-         if len(sys.argv) == 6:
-             if (sys.argv[5].upper() == "TRUE"):
-                 verbose = True
-             else:
-                 verbose = False
-     else:
-         domains = False
-         verbose = False
- # urlparse the start url
- surlparsed = urlparse.urlparse(starturl)
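The removed docstring maps one-to-one onto the positional arguments parsed above. Purely as an illustration (the script and database file names are assumptions and the URL is a placeholder, none of them come from this commit), the old version would be launched like:

    python PyCrawler.py crawl.db http://example.com 3 "example\.com" true

i.e. crawl http://example.com to depth 3, stay on URLs matching the regex example\.com, and print each URL as it is dequeued.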
+ # db init
+ cdb = CrawlerDb()
+ cdb.connect()
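query.CrawlerDb itself is not part of this diff, so only its call sites are visible: connect(), enqueue(list), dequeue(), checkCrawled(url), addPage(dict) and close(). The following is a minimal sketch of an object satisfying that interface, assuming an sqlite store loosely modeled on the tables the removed code created; the table and column names are illustrative, not the real query.py.

    import sqlite3

    class CrawlerDb(object):
        # Sketch only: stands in for query.CrawlerDb as this file uses it.
        def connect(self, dbname='crawl.db'):
            self.conn = sqlite3.connect(dbname)
            self.conn.execute('CREATE TABLE IF NOT EXISTS queue (id INTEGER PRIMARY KEY, url TEXT)')
            self.conn.execute('CREATE TABLE IF NOT EXISTS crawl_index (url TEXT, status INTEGER, title TEXT, keywords TEXT)')

        def enqueue(self, urls):
            # urls is a list, matching cdb.enqueue(sys.argv[1:]) below
            self.conn.executemany('INSERT INTO queue (url) VALUES (?)', [(u,) for u in urls])
            self.conn.commit()

        def dequeue(self):
            row = self.conn.execute('SELECT id, url FROM queue ORDER BY id LIMIT 1').fetchone()
            if row is None:
                return False  # the caller tests "if url is False"
            self.conn.execute('DELETE FROM queue WHERE id = ?', (row[0],))
            self.conn.commit()
            return row[1]

        def checkCrawled(self, url):
            return self.conn.execute('SELECT 1 FROM crawl_index WHERE url = ?', (url,)).fetchone() is not None

        def addPage(self, data):
            # data is whatever dict ContentProcessor.getDataDict() returns; the keys are assumed
            self.conn.execute('INSERT INTO crawl_index (url, status, title, keywords) VALUES (?, ?, ?, ?)',
                              (data.get('url'), data.get('status'), data.get('title'), data.get('keywords')))
            self.conn.commit()

        def close(self):
            self.conn.close()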

- # Connect to the db and create the tables if they don't already exist
- connection = sqlite.connect(dbname)
- cursor = connection.cursor()
- # crawl_index: holds all the information of the urls that have been crawled
- cursor.execute('CREATE TABLE IF NOT EXISTS crawl_index (crawlid INTEGER, parentid INTEGER, url VARCHAR(256), title VARCHAR(256), keywords VARCHAR(256), status INTEGER )')
- # queue: this should be obvious
- cursor.execute('CREATE TABLE IF NOT EXISTS queue (id INTEGER PRIMARY KEY, parent INTEGER, depth INTEGER, url VARCHAR(256))')
- # status: Contains a record of when crawling was started and stopped.
- # Mostly in place for a future application to watch the crawl interactively.
- cursor.execute('CREATE TABLE IF NOT EXISTS status ( s INTEGER, t TEXT )')
- connection.commit()
+ # content processor init
+ processor = ContentProcessor(None, None, None)
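Likewise, content_processor.ContentProcessor only appears here through setInfo(url, status, data), process() (which has to return the list of links to enqueue), getDataDict() and reset(). A sketch of that contract, assuming regex link extraction in the spirit of the removed code; the dict keys are assumptions.

    import re

    class ContentProcessor(object):
        # Sketch only: stands in for content_processor.ContentProcessor as this file uses it.
        linkregex = re.compile('<a\s(?:.*?\s)*?href=[\'"](.*?)[\'"].*?>')

        def __init__(self, url, status, text):
            self.setInfo(url, status, text)

        def setInfo(self, url, status, text):
            self.url, self.status, self.text = url, status, text
            self.links = []

        def process(self):
            # Extract outgoing links; the caller enqueues whatever this returns.
            self.links = self.linkregex.findall(self.text or '')
            return self.links

        def getDataDict(self):
            # Record handed to CrawlerDb.addPage(); key names are assumed.
            return {'url': self.url, 'status': self.status, 'links': self.links}

        def reset(self):
            self.setInfo(None, None, None)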

- # Compile keyword and link regex expressions
- keywordregex = re.compile('<meta\sname=["\']keywords["\']\scontent=["\'](.*?)["\']\s/>')
- linkregex = re.compile('<a\s(?:.*?\s)*?href=[\'"](.*?)[\'"].*?>')
- if domains:
-     domainregex = re.compile(domains)
- else:
-     domainregex = False
- crawled = []
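These two patterns do all of the HTML parsing in the old version: the first pulls the content attribute out of a keywords meta tag, the second pulls the href target out of each anchor. Applied to a made-up fragment:

    >>> sample = '<meta name="keywords" content="python, crawler" /><a class="nav" href="/about.html">About</a>'
    >>> keywordregex.findall(sample)
    ['python, crawler']
    >>> linkregex.findall(sample)
    ['/about.html']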
+ if len(sys.argv) < 2:
+     print "Error: No start url was passed"
+     sys.exit()

- # set crawling status and stick starting url into the queue
- cursor.execute("INSERT INTO status VALUES ((?), (?))", (1, "datetime('now')"))
- cursor.execute("INSERT INTO queue VALUES ((?), (?), (?), (?))", (None, 0, 0, starturl))
- connection.commit()
+ l = sys.argv[1:]

+ cdb.enqueue(l)

- # insert starting url into queue
+ def crawl():
+     print "starting..."
+     queue_empty = False
+     while True:
+         url = cdb.dequeue()
+         print url
+         if cdb.checkCrawled(url):
+             continue
+         if url is False:
+             queue_empty = True

- class threader(threading.Thread):
-
-     # Parser for robots.txt that helps determine if we are allowed to fetch a url
-     rp = robotparser.RobotFileParser()
-
-     """
-     run()
-     Args:
-         none
-     the run() method contains the main loop of the program. Each iteration takes the url
-     at the top of the queue and starts the crawl of it.
-     """
-     def run(self):
-         while 1:
-             try:
-                 # Get the first item from the queue
-                 cursor.execute("SELECT * FROM queue LIMIT 1")
-                 crawling = cursor.fetchone()
-                 # Remove the item from the queue
-                 cursor.execute("DELETE FROM queue WHERE id = (?)", (crawling[0], ))
-                 connection.commit()
-                 if verbose:
-                     print crawling[3]
-             except KeyError:
-                 raise StopIteration
-             except:
-                 pass
-
-             # if there's nothing in the queue, then set the status to done and exit
-             if crawling == None:
-                 cursor.execute("INSERT INTO status VALUES ((?), datetime('now'))", (0,))
-                 connection.commit()
-                 sys.exit("Done!")
-             # Crawl the link
-             self.crawl(crawling)
-
-     """
-     crawl()
-     Args:
-         crawling: a queue row (id, parent, depth, url)
-
-     crawl() opens the page at the "crawling" url, parses it and puts it into the database.
-     It looks for the page title, keywords, and links.
-     """
-     def crawl(self, crawling):
-         # crawler id
-         cid = crawling[0]
-         # parent id. 0 if start url
-         pid = crawling[1]
-         # current depth
-         curdepth = crawling[2]
-         # crawling url
-         curl = crawling[3]
-         if domainregex and not domainregex.search(curl):
-             return
-         # Split the link into its sections
-         url = urlparse.urlparse(curl)
-
+         # Get HTTPConnection
+         #connection = httplib.HTTPConnection(parsed_url.netloc)
+         # Make the request
+         #connection.request("GET", parsed_url.path)
+         # Get response
+         #response = connection.getresponse()
+         #data = response.read()
+         status = 0
+         request = None
          try:
-             # Have our robot parser grab the robots.txt file and read it
-             self.rp.set_url('http://' + url[1] + '/robots.txt')
-             self.rp.read()
-
-             # If we're not allowed to open a url, return so we skip it
-             if not self.rp.can_fetch('PyCrawler', curl):
-                 if verbose:
-                     print curl + " not allowed by robots.txt"
-                 return
-         except:
-             pass
-
-         try:
-             # Add the link to the already crawled list
-             crawled.append(curl)
-         except MemoryError:
-             # If the crawled list gets too big, delete it and start over
-             del crawled[:]
-         try:
-             # Create a Request object
-             request = urllib2.Request(curl)
-             # Add user-agent header to the request
-             request.add_header("User-Agent", "PyCrawler")
-             # Build the url opener, open the link and read it into msg
-             opener = urllib2.build_opener()
-             f = opener.open(request)
-             msg = f.read()
-             # put meta data in info
-             info = f.info()
-
-
+             request = urllib2.urlopen(str(url))
          except urllib2.URLError, e:
-             # If it doesn't load, skip this url
-             #print e.code
-             try:
-                 cursor.execute("INSERT INTO crawl_index VALUES( (?), (?), (?), (?), (?), (?) )", (cid, pid, curl, '', '', e.code))
-                 connection.commit()
-             except:
-                 pass
+             print e.reason
+         except urllib2.HTTPError, e:
+             status = e.code
+         if status == 0:
+             status = 200
+         data = request.read()

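One detail worth keeping in mind when reading this block: in urllib2, HTTPError is a subclass of URLError, so the order of the except clauses matters when you want to read the HTTP status code. A small illustration, with a placeholder URL:

    import urllib2
    try:
        urllib2.urlopen('http://example.com/some-missing-page')
    except urllib2.HTTPError, e:   # must precede URLError to see e.code
        print e.code
    except urllib2.URLError, e:
        print e.reason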
-             return
-
-         # Find what's between the title tags
-         startPos = msg.find('<title>')
-         if startPos != -1:
-             endPos = msg.find('</title>', startPos + 7)
-             if endPos != -1:
-                 title = msg[startPos + 7:endPos]
-
-         # Start the keywords list with what's in the keywords meta tag, if there is one
-         keywordlist = keywordregex.findall(msg)
-         if len(keywordlist) > 0:
-             keywordlist = keywordlist[0]
-         else:
-             keywordlist = ""
-
-
-
-         # Get the links
-         links = linkregex.findall(msg)
-         # queue up the links
-         self.queue_links(url, links, cid, curdepth)
+         if VERBOSE:
+             print "Got %s status from %s" % (status, url)
+         processor.setInfo(str(url), status, data)
+         add_queue = processor.process()
+         l = len(add_queue)
+         print "Found %i links" % l
+         if l > 0:
+             if queue_empty == True:
+                 queue_empty = False
+             cdb.enqueue(add_queue)
+         cdb.addPage(processor.getDataDict())
+         processor.reset()
+         if queue_empty:
+             break

-         try:
-             # Put the now-crawled link into the db
-             cursor.execute("INSERT INTO crawl_index VALUES( (?), (?), (?), (?), (?), (?) )", (cid, pid, curl, title, keywordlist, 200))
-             connection.commit()
-         except:
-             pass
-
-
-     def queue_links(self, url, links, cid, curdepth):
-         if curdepth < crawldepth:
-             # Read the links and insert them into the queue
-             for link in links:
-                 cursor.execute("SELECT url FROM queue WHERE url=?", [link])
-                 for row in cursor:
-                     if row[0].decode('utf-8') == url:
-                         continue
-                 if link.startswith('/'):
-                     link = 'http://' + url[1] + link
-                 elif link.startswith('#'):
-                     continue
-                 elif not link.startswith('http'):
-                     link = urlparse.urljoin(url.geturl(), link)
-
-                 if link.decode('utf-8') not in crawled:
-                     try:
-                         cursor.execute("INSERT INTO queue VALUES ( (?), (?), (?), (?) )", (None, cid, curdepth + 1, link))
-                         connection.commit()
-                     except:
-                         continue
-         else:
-             pass
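queue_links normalizes each href before queueing it: root-relative links get 'http://' plus the current host prepended, bare fragment links are skipped, and any other relative link is resolved against the current page with urlparse.urljoin, which behaves like this (URLs are illustrative):

    >>> urlparse.urljoin('http://example.com/docs/index.html', 'page2.html')
    'http://example.com/docs/page2.html'
    >>> urlparse.urljoin('http://example.com/docs/index.html', '../about.html')
    'http://example.com/about.html'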
- if __name__ == '__main__':
-     try:
-         import psyco
-         psyco.full()
-     except ImportError:
-         print "Continuing without psyco JIT compilation!"
-     # Run main loop
-     threader().run()
+     print "finishing..."
+     cdb.close()
+     print "done! goodbye!"
+
+ if __name__ == "__main__":
+     crawl()
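The rewritten entry point treats every command-line argument as a start url (l = sys.argv[1:]) and seeds the queue with cdb.enqueue(l); the database name, depth and domain arguments of the old version are gone, and only the VERBOSE flag survives, now imported from settings. Assuming the file keeps the PyCrawler.py name (the diff does not name it), a run might look like:

    python PyCrawler.py http://example.com http://example.org

Both URLs are enqueued and crawl() loops until dequeue() reports the queue is empty.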