Teach search crawler about internal sitemap
author Magnus Hagander <magnus@hagander.net>
Thu, 23 Mar 2017 15:31:39 +0000 (16:31 +0100)
committer Magnus Hagander <magnus@hagander.net>
Thu, 23 Mar 2017 15:31:39 +0000 (16:31 +0100)
We only support this for our main website, which uses a sitemap, so
implement it only for that provider. Always probe sitemap_internal.xml,
since this crawler never tries to access any external sites anyway.

tools/search/crawler/lib/basecrawler.py
tools/search/crawler/lib/genericsite.py
tools/search/crawler/lib/sitemapsite.py
tools/search/sql/schema.sql

diff --git a/tools/search/crawler/lib/basecrawler.py b/tools/search/crawler/lib/basecrawler.py
index 95a748ccb7da11566ba92711804080bfebce0c60..62934c6d8b819fd45dbcf651e3295b7e00290406 100644
@@ -81,9 +81,9 @@ class BaseSiteCrawler(object):
 
        def crawl_from_queue(self):
                while not self.stopevent.is_set():
-                       (url, relprio) = self.queue.get()
+                       (url, relprio, internal) = self.queue.get()
                        try:
-                               self.crawl_page(url, relprio)
+                               self.crawl_page(url, relprio, internal)
                        except Exception, e:
                                log("Exception crawling '%s': %s" % (url, e))
                        self.queue.task_done()
@@ -91,7 +91,7 @@ class BaseSiteCrawler(object):
        def exclude_url(self, url):
                return False
 
-       def crawl_page(self, url, relprio):
+       def crawl_page(self, url, relprio, internal):
                if self.pages_crawled.has_key(url) or self.pages_crawled.has_key(url+"/"):
                        return
 
@@ -131,10 +131,10 @@ class BaseSiteCrawler(object):
                        log(e)
                        return
 
-               self.save_page(url, lastmod, relprio)
+               self.save_page(url, lastmod, relprio, internal)
                self.post_process_page(url)
 
-       def save_page(self, url, lastmod, relprio):
+       def save_page(self, url, lastmod, relprio, internal):
                if relprio == 0.0:
                        relprio = 0.5
                params = {
@@ -144,11 +144,12 @@ class BaseSiteCrawler(object):
                        'site': self.siteid,
                        'url': url,
                        'relprio': relprio,
+                       'internal': internal,
                        }
                curs = self.dbconn.cursor()
-               curs.execute("UPDATE webpages SET title=%(title)s, txt=%(txt)s, fti=to_tsvector(%(txt)s), lastscanned=%(lastmod)s, relprio=%(relprio)s WHERE site=%(site)s AND suburl=%(url)s", params)
+               curs.execute("UPDATE webpages SET title=%(title)s, txt=%(txt)s, fti=to_tsvector(%(txt)s), lastscanned=%(lastmod)s, relprio=%(relprio)s, isinternal=%(internal)s WHERE site=%(site)s AND suburl=%(url)s", params)
                if curs.rowcount != 1:
-                       curs.execute("INSERT INTO webpages (site, suburl, title, txt, fti, lastscanned, relprio) VALUES (%(site)s, %(url)s, %(title)s, %(txt)s, to_tsvector(%(txt)s), %(lastmod)s, %(relprio)s)", params)
+                       curs.execute("INSERT INTO webpages (site, suburl, title, txt, fti, lastscanned, relprio, isinternal) VALUES (%(site)s, %(url)s, %(title)s, %(txt)s, to_tsvector(%(txt)s), %(lastmod)s, %(relprio)s, %(internal)s)", params)
                        with self.counterlock:
                                self.pages_new += 1
                else:
diff --git a/tools/search/crawler/lib/genericsite.py b/tools/search/crawler/lib/genericsite.py
index 6aa5780e6d2d60c3687f21cad904910e7c628cbf..5377516465c7b7687c90bf581639cdaf9b0484ec 100644
@@ -24,11 +24,11 @@ class GenericSiteCrawler(BaseSiteCrawler):
                self.extra_excludes = [re.compile(x) for x, in curs.fetchall()]
 
                # We *always* crawl the root page, of course
-               self.queue.put(("/", 0.5))
+               self.queue.put(("/", 0.5, False))
 
                # Now do all the other pages
                for x in allpages:
-                       self.queue.put((x, 0.5))
+                       self.queue.put((x, 0.5, False))
 
        def exclude_url(self, url):
                if self.robots and self.robots.block_url(url):
@@ -39,7 +39,7 @@ class GenericSiteCrawler(BaseSiteCrawler):
                return False
 
        def queue_url(self, url):
-               self.queue.put((url.strip(), 0.5))
+               self.queue.put((url.strip(), 0.5, False))
 
        def post_process_page(self, url):
                for l in self.resolve_links(self.page.links, url):
diff --git a/tools/search/crawler/lib/sitemapsite.py b/tools/search/crawler/lib/sitemapsite.py
index 3d0ebd1b94f7dfd68402755370fcc856f01f728f..81c2f438960660983a2f3661a7004acce4f1a8bb 100644
@@ -15,12 +15,14 @@ class SitemapParser(object):
                self.getprio = False
                self.getlastmod = False
                self.currstr = ""
+               self.internal = False
                self.urls = []
 
-       def parse(self, f):
+       def parse(self, f, internal=False):
                self.parser.StartElementHandler = lambda name,attrs: self.processelement(name,attrs)
                self.parser.EndElementHandler = lambda name: self.processendelement(name)
                self.parser.CharacterDataHandler = lambda data: self.processcharacterdata(data)
+               self.internal = internal
 
                self.parser.ParseFile(f)
 
@@ -50,7 +52,7 @@ class SitemapParser(object):
                        self.getlastmod = False
                        self.currentlastmod = dateutil.parser.parse(self.currstr)
                elif name == "url":
-                       self.urls.append((self.currenturl, self.currentprio, self.currentlastmod))
+                       self.urls.append((self.currenturl, self.currentprio, self.currentlastmod, self.internal))
 
        def processcharacterdata(self, data):
                if self.geturl or self.getprio or self.getlastmod:
@@ -68,7 +70,16 @@ class SitemapSiteCrawler(BaseSiteCrawler):
                p.parse(u)
                u.close()
 
-               for url, prio, lastmod in p.urls:
+               # Attempt to fetch sitemap_internal.xml. This is used to index
+               # pages on our internal search engine that we don't want on
+               # Google. They should also be excluded from default search
+               # results (unless searching with a specific suburl)
+               u = urllib.urlopen("https://%s/sitemap_internal.xml" % self.hostname)
+               if u.getcode() == 200:
+                       p.parse(u, True)
+               u.close()
+
+               for url, prio, lastmod, internal in p.urls:
                        # Advance 8 characters - length of https://.
                        url = url[len(self.hostname)+8:]
                        if lastmod:
@@ -79,7 +90,7 @@ class SitemapSiteCrawler(BaseSiteCrawler):
                                                # to make sure we don't remove it...
                                                self.pages_crawled[url] = 1
                                                continue
-                       self.queue.put((url, prio))
+                       self.queue.put((url, prio, internal))
 
                log("About to crawl %s pages from sitemap" % self.queue.qsize())
 
diff --git a/tools/search/sql/schema.sql b/tools/search/sql/schema.sql
index 83573259e2bf3c2a25cc59e45c4cc941c79b8c5d..2ed257331eeec1c069933abdb9092a40195a7059 100644
@@ -32,6 +32,7 @@ CREATE TABLE webpages (
    suburl varchar(512) NOT NULL,
    title varchar(128) NOT NULL,
    relprio float NOT NULL DEFAULT 0.5,
+   isinternal boolean NOT NULL DEFAULT 'f',
    lastscanned timestamptz NULL,
    txt text NOT NULL,
    fti tsvector NOT NULL
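
For context (not part of this commit): a rough sketch of how the query side could
use the new isinternal column, so that internal pages are skipped in default search
results but still reachable when the search is restricted to a specific suburl. Only
the webpages columns come from the schema above; the search() helper, its parameters,
and the ranking/limit choices are hypothetical.

    # Hypothetical query-side helper; not part of this commit. Assumes a
    # psycopg2 connection like the crawler's dbconn and the webpages table
    # from the schema above.
    def search(conn, siteid, query, suburl=None):
        curs = conn.cursor()
        if suburl:
            # Explicit suburl search: internal pages are included
            curs.execute("""SELECT suburl, title
                              FROM webpages
                             WHERE site=%(site)s
                               AND fti @@ plainto_tsquery(%(q)s)
                               AND suburl LIKE %(suburl)s || '%%'
                             ORDER BY ts_rank(fti, plainto_tsquery(%(q)s)) DESC
                             LIMIT 20""",
                         {'site': siteid, 'q': query, 'suburl': suburl})
        else:
            # Default search: hide anything flagged as internal
            curs.execute("""SELECT suburl, title
                              FROM webpages
                             WHERE site=%(site)s
                               AND fti @@ plainto_tsquery(%(q)s)
                               AND NOT isinternal
                             ORDER BY ts_rank(fti, plainto_tsquery(%(q)s)) DESC
                             LIMIT 20""",
                         {'site': siteid, 'q': query})
        return curs.fetchall()

    # Example usage (connection string is illustrative):
    # conn = psycopg2.connect("dbname=search")
    # results = search(conn, 1, "replication slots")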