Skip to content

Commit 38593b6

Browse files
committed
Changes to the content processor
1 parent ca6d7a8 commit 38593b6

File tree

1 file changed

+6
-2
lines changed

1 file changed

+6
-2
lines changed

content_processor.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,11 @@ def rankKeywords(text):
2121

2222
def stripPunctuation(text):
    """Replace every punctuation character in *text* with a single space.

    A character counts as punctuation here when it is neither a word
    character (``\\w``: letters, digits, underscore) nor whitespace.

    Substituting a space rather than the empty string keeps words that
    were separated only by punctuation from being fused together
    (``"a,b"`` becomes ``"a b"``, not ``"ab"``).

    Returns the resulting string; the input is not modified.
    """
    pattern = re.compile(r'[^\w\s]')
    return pattern.sub(' ', text)
25+
26+
def stripScript(text):
    """Replace every ``<script ...>...</script>`` element in *text* with a space.

    The match is:
      * case-insensitive, so ``<SCRIPT>`` is handled like ``<script>``;
      * newline-spanning (DOTALL), so multi-line script bodies are removed
        in full rather than being left behind;
      * anchored with a word boundary after ``script`` so unrelated tags
        whose names merely start with "script" are left alone;
      * tolerant of whitespace before the ``>`` of the closing tag.

    An opening tag with no matching closing tag is left untouched.

    Returns the resulting string; the input is not modified.
    """
    pattern = re.compile(
        r'<script\b[^>]*>.*?</script\s*>',
        re.DOTALL | re.IGNORECASE,
    )
    return pattern.sub(' ', text)
2529

2630
class ContentProcessor:
2731

@@ -77,7 +81,7 @@ def combineKeywordLists(self):
7781
def processBody(self):
7882
queue = ready_queue(self.url, self.body)
7983
#print "found %i links to queue" % len(queue)
80-
self.text = stripPunctuation(self.remove_html_tags(self.body))
84+
self.text = stripPunctuation(self.remove_html_tags(stripScript(self.body)))
8185
if len(self.text) > 5000:
8286
offset = 0
8387
i = 0

0 commit comments

Comments
 (0)