We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent ca6d7a8, commit 38593b6 (Copy full SHA for 38593b6)
content_processor.py
@@ -21,7 +21,11 @@ def rankKeywords(text):
21
22
def stripPunctuation(text):
23
pattern = re.compile(r'[^\w\s]')
24
- return pattern.sub('', text)
+ return pattern.sub(' ', text)
25
+
26
+def stripScript(text):
27
+ pattern = re.compile(r'<script.*?\/script>')
28
29
30
class ContentProcessor:
31
@@ -77,7 +81,7 @@ def combineKeywordLists(self):
77
81
def processBody(self):
78
82
queue = ready_queue(self.url, self.body)
79
83
#print "found %i links to queue" % len(queue)
80
- self.text = stripPunctuation(self.remove_html_tags(self.body))
84
+ self.text = stripPunctuation(self.remove_html_tags(stripScript(self.body)))
85
if len(self.text) > 5000:
86
offset = 0
87
i = 0
0 commit comments