
Commit ad55cd8

Work on colors, Added option to rotate sqlite database files

2 parents: 4425b4c + dacd344

File tree: 6 files changed, +70 -35 lines


PyCrawler.db.1

220 KB, binary file not shown.

PyCrawler.py

Lines changed: 44 additions & 13 deletions
@@ -1,7 +1,7 @@
 from query import CrawlerDb
 from content_processor import ContentProcessor
-from settings import VERBOSE, COLOR_ERROR, COLOR_SUCCESS
-import sys, urlparse, urllib2
+from settings import VERBOSE, USE_COLORS, DATABASE_ENGINE, DATABASE_NAME, SQLITE_ROTATE_DATABASE_ON_STARTUP
+import sys, urlparse, urllib2, shutil, glob, robotparser
 import cPrinter
 
 # ===== Init stuff =====
@@ -14,36 +14,58 @@
 processor = ContentProcessor(None, None, None)
 
 # get cprinter
-printer = cPrinter.Printer(COLOR_SUCCESS, COLOR_ERROR)
+printer = cPrinter.Printer(USE_COLORS)
+
+# robot parser init
+robot = robotparser.RobotFileParser()
 
 if len(sys.argv) < 2:
-	printer.p("Error: No start url was passed", printer.error)
+	printer.p("Error: No start url was passed", printer.other)
 	sys.exit()
 
 l = sys.argv[1:]
 
 cdb.enqueue(l)
 
 def crawl():
-	printer.p("starting...", printer.success)
-	queue_empty = False
+	printer.p("starting...", printer.other)
 	while True:
 		url = cdb.dequeue()
+		u = urlparse.urlparse(url)
+		robot.set_url('http://'+u[1]+"/robots.txt")
+		if not robot.can_fetch('PyCrawler', url):
+			printer.p("Url disallowed by robots.txt: %s " % url, printer.other)
+			continue
+		if not url.startswith('http'):
+			printer.p("Unfollowable link found at %s " % url, printer.other)
+			continue
+
 		if cdb.checkCrawled(url):
 			continue
 		if url is False:
-			queue_empty = True
+			break
 		status = 0
 		request = None
 		try:
 			request = urllib2.urlopen(str(url))
 		except urllib2.URLError, e:
 			printer.p(e.reason, printer.error)
+			printer.p("Exception at url: %s" % url, printer.error)
+
+			continue
 		except urllib2.HTTPError, e:
 			status = e.code
 		if status == 0:
 			status = 200
 		data = request.read()
+		processor.setInfo(str(url), status, data)
+		ret = processor.process()
+		if status != 200:
+			continue
+		add_queue = []
+		for q in ret:
+			if not cdb.checkCrawled(q):
+				add_queue.append(q)
 
 		processor.setInfo(str(url), status, data)
 		add_queue = processor.process()
@@ -52,17 +74,26 @@ def crawl():
 		printer.p("Got %s status from %s" % (status, url), printer.success)
 		printer.p("Found %i links" % l, printer.success)
 		if l > 0:
-			if queue_empty == True:
-				queue_empty = False
 			cdb.enqueue(add_queue)
 		cdb.addPage(processor.getDataDict())
 		processor.reset()
-		if queue_empty:
-			break
 
-	printer.p("finishing...", printer.success)
+	printer.p("finishing...", printer.other)
 	cdb.close()
 	printer.p("done! goodbye!", printer.success)
 
 if __name__ == "__main__":
-	crawl()
+	if DATABASE_ENGINE == "sqlite" and SQLITE_ROTATE_DATABASE_ON_STARTUP:
+		dbs = glob.glob("*.db*")
+		index = 1;
+		while("%s.db.%s" % (DATABASE_NAME, index) in dbs):
+			index += 1
+		shutil.copy2(dbs[len(dbs)-1], "%s.db.%s" % (DATABASE_NAME, index))
+	try:
+		crawl()
+	except KeyboardInterrupt:
+		printer.p("Stopping", printer.error)
+		sys.exit()
+	except Exception, e:
+		printer.p("EXCEPTION: %s " % e, printer.error)
+
99+

cPrinter.py

Lines changed: 16 additions & 14 deletions
@@ -2,10 +2,13 @@
 
 class Printer():
 
-	def __init__(self, COLOR_SUCCESS, COLOR_ERROR):
+	def __init__(self, USE_COLORS):
 		# Define our types
 		self.success = 0;
 		self.error = 1;
+		self.other = 2;
+
+		self.USE_COLORS = USE_COLORS
 
 		# Initialize environment
 		curses.setupterm()
@@ -16,19 +19,18 @@ def __init__(self, COLOR_SUCCESS, COLOR_ERROR):
 		#Get the normal attribute
 		self.COLOR_NORMAL = curses.tigetstr('sgr0')
 
-		# Initialize custom colors to the first two slots
-		curses.initscr()
-		curses.start_color()
-		curses.init_color(0, COLOR_SUCCESS[0], COLOR_SUCCESS[1], COLOR_SUCCESS[2])
-		curses.init_color(1, COLOR_ERROR[0], COLOR_ERROR[1], COLOR_ERROR[2])
-		curses.endwin()
-
 		# Get + Save the color sequences
-		self.COLOR_SUCCESS = curses.tparm(self.fcap, 0)
-		self.COLOR_ERROR = curses.tparm(self.fcap, 1)
+		self.COLOR_SUCCESS = curses.tparm(self.fcap, curses.COLOR_GREEN)
+		self.COLOR_ERROR = curses.tparm(self.fcap, curses.COLOR_RED)
+		self.COLOR_OTHER = curses.tparm(self.fcap, curses.COLOR_YELLOW)
 
 	def p(self, text, type):
-		if type == self.success:
-			print "%s%s%s" % (self.COLOR_SUCCESS, text, self.COLOR_NORMAL)
-		elif type == self.error:
-			print "%s%s%s" % (self.COLOR_SUCCESS, text, self.COLOR_NORMAL)
+		if self.USE_COLORS:
+			if type == self.success:
+				print "%s[*] %s%s" % (self.COLOR_SUCCESS, text, self.COLOR_NORMAL)
+			elif type == self.error:
+				print "%s[!] %s%s" % (self.COLOR_ERROR, text, self.COLOR_NORMAL)
+			elif type == self.other:
+				print "%s[.] %s%s" % (self.COLOR_OTHER, text, self.COLOR_NORMAL)
+		else:
+			print text
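
With the RGB-list constructor gone, callers pass a single USE_COLORS flag and tag each message with one of three type constants. A minimal usage sketch, assuming a terminal whose capabilities curses.setupterm() can resolve:

from cPrinter import Printer

printer = Printer(True)                         # True enables colored output
printer.p("starting...", printer.other)         # yellow, prefixed [.]
printer.p("Got 200 status", printer.success)    # green, prefixed [*]
printer.p("connection refused", printer.error)  # red, prefixed [!]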

content_processor.py

Lines changed: 4 additions & 0 deletions
@@ -20,6 +20,7 @@ def rankKeywords(text):
 def stripPunctuation(text):
 	pattern = re.compile(r'[^\w\s]')
 	return pattern.sub('', text)
+
 class ContentProcessor:
 
 	def __init__(self, url, status, text):
@@ -114,4 +115,7 @@ def process(self):
 		return queue
 
 	def getDataDict(self):
+		for k,v in self.keywords.items():
+			if v < 3:
+				del self.keywords[k]
 		return {"address":self.url, "title":self.title, "status":self.status, "size":self.size, "keywords":self.keywords}

query.py

Lines changed: 3 additions & 3 deletions
@@ -56,7 +56,7 @@ def enqueue(self, urls):
 			return False
 		if len(urls) == 0:
 			return True
-		args = [{'address':u} for u in urls]
+		args = [{'address':unicode(u)} for u in urls]
 		result = self.connection.execute(self.queue_table.insert(), args)
 		if result:
 			return True
@@ -81,7 +81,7 @@ def dequeue(self):
 			return False
 
 	def checkCrawled(self, url):
-		s = select([self.crawl_table]).where(self.crawl_table.c.address == url)
+		s = select([self.crawl_table]).where(self.crawl_table.c.address == unicode(url))
 		result = self.connection.execute(s)
 		if len(result.fetchall()) > 0:
 			result.close()
@@ -100,7 +100,7 @@ def addPage(self, data):
 		if not self.connected:
 			return False
 		# Add the page to the crawl table
-		result = self.connection.execute(self.crawl_table.insert().values(address=data['address'],http_status=data['status'],title=data['title'],size=data['size']))
+		result = self.connection.execute(self.crawl_table.insert().values(address=unicode(data['address']),http_status=data['status'],title=unicode(data['title']),size=data['size']))
 		if not result:
 			return False
 		# generate list of argument dictionaries for the insert many statement
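
Coercing values with unicode() before they reach SQLAlchemy matches what Unicode-typed columns expect under Python 2 and avoids the library's warnings about non-unicode bind parameters. A self-contained sketch of the pattern in the old-style SQLAlchemy API used here; the column definitions are assumptions, since query.py's table layout is not part of this diff:

from sqlalchemy import create_engine, MetaData, Table, Column, Integer, UnicodeText, select

engine = create_engine("sqlite:///:memory:")
metadata = MetaData()
# Assumed layout; the real queue/crawl tables live in query.py.
queue_table = Table("queue", metadata,
                    Column("id", Integer, primary_key=True),
                    Column("address", UnicodeText))
metadata.create_all(engine)

conn = engine.connect()
urls = ["http://example.com/"]
conn.execute(queue_table.insert(), [{"address": unicode(u)} for u in urls])
rows = conn.execute(select([queue_table]).where(
    queue_table.c.address == unicode("http://example.com/")))
print rows.fetchall()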

settings.py

Lines changed: 3 additions & 5 deletions
@@ -7,10 +7,8 @@
 DATABASE_USER = "" # Not used with sqlite
 DATABASE_PASS = "" # Not used with sqlite
 
-VERBOSE = True
+SQLITE_ROTATE_DATABASE_ON_STARTUP = True # Rotate the database to a new one on startup
 
-# These values are for the text output colors.
-# List values are 0-255 RGB values, respectively.
+VERBOSE = True
 
-COLOR_SUCCESS = [0, 255, 0] # Success Color (Green)
-COLOR_ERROR = [255, 0, 0] # Error Color (Red)
+USE_COLORS = True # Whether or not colors should be used when printing text
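
With SQLITE_ROTATE_DATABASE_ON_STARTUP enabled, startup copies the last database file returned by glob.glob("*.db*") to the next free numeric suffix (PyCrawler.db.1, PyCrawler.db.2, ...), which is how the PyCrawler.db.1 snapshot listed above came to exist. A standalone sketch of that rotation step, with a guard added for the case where no database file exists yet:

import glob, shutil

DATABASE_NAME = "PyCrawler"  # mirrors settings.DATABASE_NAME
dbs = glob.glob("*.db*")     # e.g. ['PyCrawler.db', 'PyCrawler.db.1']
index = 1
while "%s.db.%s" % (DATABASE_NAME, index) in dbs:
    index += 1               # first suffix not already taken
if dbs:                      # guard not present in the commit itself
    shutil.copy2(dbs[-1], "%s.db.%s" % (DATABASE_NAME, index))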
