From 7edb14284dba2b521249731073ad0b44267cb479 Mon Sep 17 00:00:00 2001
From: Magnus Hagander
Date: Thu, 23 Mar 2017 16:31:39 +0100
Subject: [PATCH] Teach search crawler about internal sitemap

We only support it for our main website, which uses a sitemap, so
implement it only for that provider. And always probe
sitemap_internal.xml, since we don't even try to access any external
sites on it.
---
 tools/search/crawler/lib/basecrawler.py | 15 ++++++++-------
 tools/search/crawler/lib/genericsite.py |  6 +++---
 tools/search/crawler/lib/sitemapsite.py | 19 +++++++++++++++----
 tools/search/sql/schema.sql             |  1 +
 4 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/tools/search/crawler/lib/basecrawler.py b/tools/search/crawler/lib/basecrawler.py
index 95a748cc..62934c6d 100644
--- a/tools/search/crawler/lib/basecrawler.py
+++ b/tools/search/crawler/lib/basecrawler.py
@@ -81,9 +81,9 @@ class BaseSiteCrawler(object):
 
 	def crawl_from_queue(self):
 		while not self.stopevent.is_set():
-			(url, relprio) = self.queue.get()
+			(url, relprio, internal) = self.queue.get()
 			try:
-				self.crawl_page(url, relprio)
+				self.crawl_page(url, relprio, internal)
 			except Exception, e:
 				log("Exception crawling '%s': %s" % (url, e))
 			self.queue.task_done()
@@ -91,7 +91,7 @@
 	def exclude_url(self, url):
 		return False
 
-	def crawl_page(self, url, relprio):
+	def crawl_page(self, url, relprio, internal):
 		if self.pages_crawled.has_key(url) or self.pages_crawled.has_key(url+"/"):
 			return
 
@@ -131,10 +131,10 @@
 			log(e)
 			return
 
-		self.save_page(url, lastmod, relprio)
+		self.save_page(url, lastmod, relprio, internal)
 		self.post_process_page(url)
 
-	def save_page(self, url, lastmod, relprio):
+	def save_page(self, url, lastmod, relprio, internal):
 		if relprio == 0.0:
 			relprio = 0.5
 		params = {
@@ -144,11 +144,12 @@
 			'site': self.siteid,
 			'url': url,
 			'relprio': relprio,
+			'internal': internal,
 		}
 		curs = self.dbconn.cursor()
-		curs.execute("UPDATE webpages SET title=%(title)s, txt=%(txt)s, fti=to_tsvector(%(txt)s), lastscanned=%(lastmod)s, relprio=%(relprio)s WHERE site=%(site)s AND suburl=%(url)s", params)
+		curs.execute("UPDATE webpages SET title=%(title)s, txt=%(txt)s, fti=to_tsvector(%(txt)s), lastscanned=%(lastmod)s, relprio=%(relprio)s, isinternal=%(internal)s WHERE site=%(site)s AND suburl=%(url)s", params)
 		if curs.rowcount != 1:
-			curs.execute("INSERT INTO webpages (site, suburl, title, txt, fti, lastscanned, relprio) VALUES (%(site)s, %(url)s, %(title)s, %(txt)s, to_tsvector(%(txt)s), %(lastmod)s, %(relprio)s)", params)
+			curs.execute("INSERT INTO webpages (site, suburl, title, txt, fti, lastscanned, relprio, isinternal) VALUES (%(site)s, %(url)s, %(title)s, %(txt)s, to_tsvector(%(txt)s), %(lastmod)s, %(relprio)s, %(internal)s)", params)
 			with self.counterlock:
 				self.pages_new += 1
 		else:
diff --git a/tools/search/crawler/lib/genericsite.py b/tools/search/crawler/lib/genericsite.py
index 6aa5780e..53775164 100644
--- a/tools/search/crawler/lib/genericsite.py
+++ b/tools/search/crawler/lib/genericsite.py
@@ -24,11 +24,11 @@ class GenericSiteCrawler(BaseSiteCrawler):
 		self.extra_excludes = [re.compile(x) for x, in curs.fetchall()]
 
 		# We *always* crawl the root page, of course
-		self.queue.put(("/", 0.5))
+		self.queue.put(("/", 0.5, False))
 
 		# Now do all the other pages
 		for x in allpages:
-			self.queue.put((x, 0.5))
+			self.queue.put((x, 0.5, False))
 
 	def exclude_url(self, url):
 		if self.robots and self.robots.block_url(url):
@@ -39,7 +39,7 @@ class GenericSiteCrawler(BaseSiteCrawler):
 		return False
 
 	def queue_url(self, url):
-		self.queue.put((url.strip(), 0.5))
+		self.queue.put((url.strip(), 0.5, False))
 
 	def post_process_page(self, url):
 		for l in self.resolve_links(self.page.links, url):
diff --git a/tools/search/crawler/lib/sitemapsite.py b/tools/search/crawler/lib/sitemapsite.py
index 3d0ebd1b..81c2f438 100644
--- a/tools/search/crawler/lib/sitemapsite.py
+++ b/tools/search/crawler/lib/sitemapsite.py
@@ -15,12 +15,14 @@ class SitemapParser(object):
 		self.getprio = False
 		self.getlastmod = False
 		self.currstr = ""
+		self.internal = False
 		self.urls = []
 
-	def parse(self, f):
+	def parse(self, f, internal=False):
 		self.parser.StartElementHandler = lambda name,attrs: self.processelement(name,attrs)
 		self.parser.EndElementHandler = lambda name: self.processendelement(name)
 		self.parser.CharacterDataHandler = lambda data: self.processcharacterdata(data)
+		self.internal = internal
 
 		self.parser.ParseFile(f)
 
@@ -50,7 +52,7 @@
 			self.getlastmod = False
 			self.currentlastmod = dateutil.parser.parse(self.currstr)
 		elif name == "url":
-			self.urls.append((self.currenturl, self.currentprio, self.currentlastmod))
+			self.urls.append((self.currenturl, self.currentprio, self.currentlastmod, self.internal))
 
 	def processcharacterdata(self, data):
 		if self.geturl or self.getprio or self.getlastmod:
@@ -68,7 +70,16 @@ class SitemapSiteCrawler(BaseSiteCrawler):
 		p.parse(u)
 		u.close()
 
-		for url, prio, lastmod in p.urls:
+		# Attempt to fetch a sitemap_internal.xml. This is used to index
+		# pages on our internal search engine that we don't want on
+		# Google. They should also be excluded from default search
+		# results (unless searching with a specific suburl).
+		u = urllib.urlopen("https://%s/sitemap_internal.xml" % self.hostname)
+		if u.getcode() == 200:
+			p.parse(u, True)
+		u.close()
+
+		for url, prio, lastmod, internal in p.urls:
 			# Advance 8 characters - length of https://.
 			url = url[len(self.hostname)+8:]
 			if lastmod:
@@ -79,7 +90,7 @@
 					# to make sure we don't remove it...
 					self.pages_crawled[url] = 1
 					continue
-			self.queue.put((url, prio))
+			self.queue.put((url, prio, internal))
 
 		log("About to crawl %s pages from sitemap" % self.queue.qsize())
 
diff --git a/tools/search/sql/schema.sql b/tools/search/sql/schema.sql
index 83573259..2ed25733 100644
--- a/tools/search/sql/schema.sql
+++ b/tools/search/sql/schema.sql
@@ -32,6 +32,7 @@ CREATE TABLE webpages (
    suburl varchar(512) NOT NULL,
    title varchar(128) NOT NULL,
    relprio float NOT NULL DEFAULT 0.5,
+   isinternal boolean NOT NULL DEFAULT 'f',
    lastscanned timestamptz NULL,
    txt text NOT NULL,
    fti tsvector NOT NULL
-- 
2.39.5
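
The new isinternal column only does its job if the query side respects it:
per the comment in sitemapsite.py above, internal pages should be excluded
from default search results unless the search is scoped to a specific
suburl. A minimal sketch of what such a query could look like, assuming a
psycopg2 connection and the webpages table from schema.sql; the search()
helper and its parameters are hypothetical and not part of this patch:

	# Hypothetical sketch - not part of the patch. Shows how a search
	# frontend could honour the isinternal flag set by the crawler.
	def search(dbconn, siteid, query, suburl=None):
		curs = dbconn.cursor()
		sql = "SELECT suburl, title FROM webpages WHERE site=%(site)s AND fti @@ plainto_tsquery(%(q)s)"
		if suburl:
			# Scoped search: internal pages may be included
			sql += " AND suburl LIKE %(suburl)s"
		else:
			# Default search: hide pages that came from sitemap_internal.xml
			sql += " AND NOT isinternal"
		curs.execute(sql, {'site': siteid, 'q': query, 'suburl': suburl and suburl + '%'})
		return curs.fetchall()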