def crawl_from_queue(self):
while not self.stopevent.is_set():
- (url, relprio) = self.queue.get()
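+ # Queue entries gain a third element: whether the page is internal-only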
+ (url, relprio, internal) = self.queue.get()
try:
- self.crawl_page(url, relprio)
+ self.crawl_page(url, relprio, internal)
except Exception, e:
log("Exception crawling '%s': %s" % (url, e))
self.queue.task_done()
def exclude_url(self, url):
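+ # Default implementation excludes nothing; site-specific crawlers override this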
return False
- def crawl_page(self, url, relprio):
+ def crawl_page(self, url, relprio, internal):
if url in self.pages_crawled or url+"/" in self.pages_crawled:
return
log(e)
return
- self.save_page(url, lastmod, relprio)
+ self.save_page(url, lastmod, relprio, internal)
self.post_process_page(url)
- def save_page(self, url, lastmod, relprio):
+ def save_page(self, url, lastmod, relprio, internal):
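+ # A relprio of 0.0 means no priority was set; fall back to the 0.5 default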
if relprio == 0.0:
relprio = 0.5
params = {
'site': self.siteid,
'url': url,
'relprio': relprio,
+ 'internal': internal,
}
curs = self.dbconn.cursor()
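+ # Upsert without ON CONFLICT: try an UPDATE first, INSERT if no row matched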
- curs.execute("UPDATE webpages SET title=%(title)s, txt=%(txt)s, fti=to_tsvector(%(txt)s), lastscanned=%(lastmod)s, relprio=%(relprio)s WHERE site=%(site)s AND suburl=%(url)s", params)
+ curs.execute("UPDATE webpages SET title=%(title)s, txt=%(txt)s, fti=to_tsvector(%(txt)s), lastscanned=%(lastmod)s, relprio=%(relprio)s, isinternal=%(internal)s WHERE site=%(site)s AND suburl=%(url)s", params)
if curs.rowcount != 1:
- curs.execute("INSERT INTO webpages (site, suburl, title, txt, fti, lastscanned, relprio) VALUES (%(site)s, %(url)s, %(title)s, %(txt)s, to_tsvector(%(txt)s), %(lastmod)s, %(relprio)s)", params)
+ curs.execute("INSERT INTO webpages (site, suburl, title, txt, fti, lastscanned, relprio, isinternal) VALUES (%(site)s, %(url)s, %(title)s, %(txt)s, to_tsvector(%(txt)s), %(lastmod)s, %(relprio)s, %(internal)s)", params)
with self.counterlock:
self.pages_new += 1
else:
self.extra_excludes = [re.compile(x) for x, in curs.fetchall()]
# We *always* crawl the root page, of course
- self.queue.put(("/", 0.5))
+ self.queue.put(("/", 0.5, False))
# Now do all the other pages
for x in allpages:
- self.queue.put((x, 0.5))
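+ # Regular pages are queued as non-internal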
+ self.queue.put((x, 0.5, False))
def exclude_url(self, url):
if self.robots and self.robots.block_url(url):
return True
def queue_url(self, url):
- self.queue.put((url.strip(), 0.5))
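+ # Directly queued URLs are never internal-only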
+ self.queue.put((url.strip(), 0.5, False))
def post_process_page(self, url):
for l in self.resolve_links(self.page.links, url):
self.getprio = False
self.getlastmod = False
self.currstr = ""
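+ # Set when parsing the internal sitemap; propagated to every <url> entry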
+ self.internal = False
self.urls = []
- def parse(self, f):
+ def parse(self, f, internal=False):
self.parser.StartElementHandler = lambda name,attrs: self.processelement(name,attrs)
self.parser.EndElementHandler = lambda name: self.processendelement(name)
self.parser.CharacterDataHandler = lambda data: self.processcharacterdata(data)
+ self.internal = internal
self.parser.ParseFile(f)
self.getlastmod = False
self.currentlastmod = dateutil.parser.parse(self.currstr)
elif name == "url":
- self.urls.append((self.currenturl, self.currentprio, self.currentlastmod))
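+ # Each entry now also records whether it came from the internal sitemap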
+ self.urls.append((self.currenturl, self.currentprio, self.currentlastmod, self.internal))
def processcharacterdata(self, data):
if self.geturl or self.getprio or self.getlastmod:
p.parse(u)
u.close()
- for url, prio, lastmod in p.urls:
+ # Attempt to fetch a sitemap_internal.xml. This is used to index
+ # pages on our internal search engine that we don't want on
+ # Google. They should also be excluded from default search
+ # results (unless searching with a specific suburl).
+ u = urllib.urlopen("https://%s/sitemap_internal.xml" % self.hostname)
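+ # The internal sitemap is optional, so only parse it on a successful fetch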
+ if u.getcode() == 200:
+ p.parse(u, True)
+ u.close()
+
+ for url, prio, lastmod, internal in p.urls:
# Strip the scheme and hostname ("https://" is 8 characters)
url = url[len(self.hostname)+8:]
if lastmod:
# to make sure we don't remove it...
self.pages_crawled[url] = 1
continue
- self.queue.put((url, prio))
+ self.queue.put((url, prio, internal))
log("About to crawl %s pages from sitemap" % self.queue.qsize())
suburl varchar(512) NOT NULL,
title varchar(128) NOT NULL,
relprio float NOT NULL DEFAULT 0.5,
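+ -- Internal pages are indexed for the internal search engine and
+ -- excluded from default search results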
+ isinternal boolean NOT NULL DEFAULT 'f',
lastscanned timestamptz NULL,
txt text NOT NULL,
fti tsvector NOT NULL