
Commit ad55cd8

Work on colors, Added option to rotate sqlite database files

2 parents: 4425b4c + dacd344

File tree: 6 files changed, +70 -35 lines


PyCrawler.db.1

220 KB, binary file not shown.

PyCrawler.py

Lines changed: 44 additions & 13 deletions
@@ -1,7 +1,7 @@
 from query import CrawlerDb
 from content_processor import ContentProcessor
-from settings import VERBOSE, COLOR_ERROR, COLOR_SUCCESS
-import sys, urlparse, urllib2
+from settings import VERBOSE, USE_COLORS, DATABASE_ENGINE, DATABASE_NAME, SQLITE_ROTATE_DATABASE_ON_STARTUP
+import sys, urlparse, urllib2, shutil, glob, robotparser
 import cPrinter
 
 # ===== Init stuff =====
@@ -14,36 +14,58 @@
 processor = ContentProcessor(None, None, None)
 
 # get cprinter
-printer = cPrinter.Printer(COLOR_SUCCESS, COLOR_ERROR)
+printer = cPrinter.Printer(USE_COLORS)
+
+# robot parser init
+robot = robotparser.RobotFileParser()
 
 if len(sys.argv) < 2:
-	printer.p("Error: No start url was passed", printer.error)
+	printer.p("Error: No start url was passed", printer.other)
 	sys.exit()
 
 l = sys.argv[1:]
 
 cdb.enqueue(l)
 
 def crawl():
-	printer.p("starting...", printer.success)
-	queue_empty = False
+	printer.p("starting...", printer.other)
 	while True:
 		url = cdb.dequeue()
+		u = urlparse.urlparse(url)
+		robot.set_url('http://'+u[1]+"/robots.txt")
+		if not robot.can_fetch('PyCrawler', url):
+			printer.p("Url disallowed by robots.txt: %s " % url, printer.other)
+			continue
+		if not url.startswith('http'):
+			printer.p("Unfollowable link found at %s " % url, printer.other)
+			continue
+
 		if cdb.checkCrawled(url):
 			continue
 		if url is False:
-			queue_empty = True
+			break
 		status = 0
 		request = None
 		try:
 			request = urllib2.urlopen(str(url))
 		except urllib2.URLError, e:
 			printer.p(e.reason, printer.error)
+			printer.p("Exception at url: %s" % url, printer.error)
+
+			continue
 		except urllib2.HTTPError, e:
 			status = e.code
 		if status == 0:
 			status = 200
 		data = request.read()
+		processor.setInfo(str(url), status, data)
+		ret = processor.process()
+		if status != 200:
+			continue
+		add_queue = []
+		for q in ret:
+			if not cdb.checkCrawled(q):
+				add_queue.append(q)
 
 		processor.setInfo(str(url), status, data)
 		add_queue = processor.process()
@@ -52,17 +74,26 @@ def crawl():
 		printer.p("Got %s status from %s" % (status, url), printer.success)
 		printer.p("Found %i links" % l, printer.success)
 		if l > 0:
-			if queue_empty == True:
-				queue_empty = False
 			cdb.enqueue(add_queue)
 		cdb.addPage(processor.getDataDict())
 		processor.reset()
-		if queue_empty:
-			break
 
-	printer.p("finishing...", printer.success)
+	printer.p("finishing...", printer.other)
 	cdb.close()
 	printer.p("done! goodbye!", printer.success)
 
 if __name__ == "__main__":
-	crawl()
+	if DATABASE_ENGINE == "sqlite" and SQLITE_ROTATE_DATABASE_ON_STARTUP:
+		dbs = glob.glob("*.db*")
+		index = 1;
+		while("%s.db.%s" % (DATABASE_NAME, index) in dbs):
+			index += 1
+		shutil.copy2(dbs[len(dbs)-1], "%s.db.%s" % (DATABASE_NAME, index))
+	try:
+		crawl()
+	except KeyboardInterrupt:
+		printer.p("Stopping", printer.error)
+		sys.exit()
+	except Exception, e:
+		printer.p("EXCEPTION: %s " % e, printer.error)
+
99+

cPrinter.py

Lines changed: 16 additions & 14 deletions
@@ -2,10 +2,13 @@
 
 class Printer():
 
-	def __init__(self, COLOR_SUCCESS, COLOR_ERROR):
+	def __init__(self, USE_COLORS):
 		# Define our types
 		self.success = 0;
 		self.error = 1;
+		self.other = 2;
+
+		self.USE_COLORS = USE_COLORS
 
 		# Initialize environment
 		curses.setupterm()
@@ -16,19 +19,18 @@ def __init__(self, COLOR_SUCCESS, COLOR_ERROR):
 		#Get the normal attribute
 		self.COLOR_NORMAL = curses.tigetstr('sgr0')
 
-		# Initialize custom colors to the first two slots
-		curses.initscr()
-		curses.start_color()
-		curses.init_color(0, COLOR_SUCCESS[0], COLOR_SUCCESS[1], COLOR_SUCCESS[2])
-		curses.init_color(1, COLOR_ERROR[0], COLOR_ERROR[1], COLOR_ERROR[2])
-		curses.endwin()
-
 		# Get + Save the color sequences
-		self.COLOR_SUCCESS = curses.tparm(self.fcap, 0)
-		self.COLOR_ERROR = curses.tparm(self.fcap, 1)
+		self.COLOR_SUCCESS = curses.tparm(self.fcap, curses.COLOR_GREEN)
+		self.COLOR_ERROR = curses.tparm(self.fcap, curses.COLOR_RED)
+		self.COLOR_OTHER = curses.tparm(self.fcap, curses.COLOR_YELLOW)
 
 	def p(self, text, type):
-		if type == self.success:
-			print "%s%s%s" % (self.COLOR_SUCCESS, text, self.COLOR_NORMAL)
-		elif type == self.error:
-			print "%s%s%s" % (self.COLOR_SUCCESS, text, self.COLOR_NORMAL)
+		if self.USE_COLORS:
+			if type == self.success:
+				print "%s[*] %s%s" % (self.COLOR_SUCCESS, text, self.COLOR_NORMAL)
+			elif type == self.error:
+				print "%s[!] %s%s" % (self.COLOR_ERROR, text, self.COLOR_NORMAL)
+			elif type == self.other:
+				print "%s[.] %s%s" % (self.COLOR_OTHER, text, self.COLOR_NORMAL)
+		else:
+			print text
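
With the RGB-list constructor gone, callers pass a single USE_COLORS flag and tag each message with one of three type constants. A minimal usage sketch, assuming a terminal whose capabilities curses.setupterm() can resolve:

from cPrinter import Printer

printer = Printer(True)                         # True enables colored output
printer.p("starting...", printer.other)         # yellow, prefixed [.]
printer.p("Got 200 status", printer.success)    # green, prefixed [*]
printer.p("connection refused", printer.error)  # red, prefixed [!]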

content_processor.py

Lines changed: 4 additions & 0 deletions
@@ -20,6 +20,7 @@ def rankKeywords(text):
 def stripPunctuation(text):
 	pattern = re.compile(r'[^\w\s]')
 	return pattern.sub('', text)
+
 class ContentProcessor:
 
 	def __init__(self, url, status, text):
@@ -114,4 +115,7 @@ def process(self):
 		return queue
 
 	def getDataDict(self):
+		for k,v in self.keywords.items():
+			if v < 3:
+				del self.keywords[k]
 		return {"address":self.url, "title":self.title, "status":self.status, "size":self.size, "keywords":self.keywords}

query.py

Lines changed: 3 additions & 3 deletions
@@ -56,7 +56,7 @@ def enqueue(self, urls):
 			return False
 		if len(urls) == 0:
 			return True
-		args = [{'address':u} for u in urls]
+		args = [{'address':unicode(u)} for u in urls]
 		result = self.connection.execute(self.queue_table.insert(), args)
 		if result:
 			return True
@@ -81,7 +81,7 @@ def dequeue(self):
 			return False
 
 	def checkCrawled(self, url):
-		s = select([self.crawl_table]).where(self.crawl_table.c.address == url)
+		s = select([self.crawl_table]).where(self.crawl_table.c.address == unicode(url))
 		result = self.connection.execute(s)
 		if len(result.fetchall()) > 0:
 			result.close()
@@ -100,7 +100,7 @@ def addPage(self, data):
 		if not self.connected:
 			return False
 		# Add the page to the crawl table
-		result = self.connection.execute(self.crawl_table.insert().values(address=data['address'],http_status=data['status'],title=data['title'],size=data['size']))
+		result = self.connection.execute(self.crawl_table.insert().values(address=unicode(data['address']),http_status=data['status'],title=unicode(data['title']),size=data['size']))
 		if not result:
 			return False
 		# generate list of argument dictionaries for the insert many statement
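
Coercing values with unicode() before they reach SQLAlchemy matches what Unicode-typed columns expect under Python 2 and avoids the library's warnings about non-unicode bind parameters. A self-contained sketch of the pattern in the old-style SQLAlchemy API used here; the column definitions are assumptions, since query.py's table layout is not part of this diff:

from sqlalchemy import create_engine, MetaData, Table, Column, Integer, UnicodeText, select

engine = create_engine("sqlite:///:memory:")
metadata = MetaData()
# Assumed layout; the real queue/crawl tables live in query.py.
queue_table = Table("queue", metadata,
                    Column("id", Integer, primary_key=True),
                    Column("address", UnicodeText))
metadata.create_all(engine)

conn = engine.connect()
urls = ["http://example.com/"]
conn.execute(queue_table.insert(), [{"address": unicode(u)} for u in urls])
rows = conn.execute(select([queue_table]).where(
    queue_table.c.address == unicode("http://example.com/")))
print rows.fetchall()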

settings.py

Lines changed: 3 additions & 5 deletions
@@ -7,10 +7,8 @@
 DATABASE_USER = "" # Not used with sqlite
 DATABASE_PASS = "" # Not used with sqlite
 
-VERBOSE = True
+SQLITE_ROTATE_DATABASE_ON_STARTUP = True # Rotate the database to a new one on startup
 
-# These values are for the text output colors.
-# List values are 0-255 RGB values, respectively.
+VERBOSE = True
 
-COLOR_SUCCESS = [0, 255, 0] # Success Color (Green)
-COLOR_ERROR = [255, 0, 0] # Error Color (Red)
+USE_COLORS = True # Whether or not colors should be used when printing text
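
With SQLITE_ROTATE_DATABASE_ON_STARTUP enabled, startup copies the last database file returned by glob.glob("*.db*") to the next free numeric suffix (PyCrawler.db.1, PyCrawler.db.2, ...), which is how the PyCrawler.db.1 snapshot listed above came to exist. A standalone sketch of that rotation step, with a guard added for the case where no database file exists yet:

import glob, shutil

DATABASE_NAME = "PyCrawler"  # mirrors settings.DATABASE_NAME
dbs = glob.glob("*.db*")     # e.g. ['PyCrawler.db', 'PyCrawler.db.1']
index = 1
while "%s.db.%s" % (DATABASE_NAME, index) in dbs:
    index += 1               # first suffix not already taken
if dbs:                      # guard not present in the commit itself
    shutil.copy2(dbs[-1], "%s.db.%s" % (DATABASE_NAME, index))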
