First stab at supporting https for generic sites
author	Magnus Hagander <magnus@hagander.net>
Sun, 2 Apr 2017 14:47:02 +0000 (16:47 +0200)
committer	Magnus Hagander <magnus@hagander.net>
Sun, 2 Apr 2017 14:47:02 +0000 (16:47 +0200)
Previously only the main website search supported https, which was less
than great for community sites that are now https-only.

tools/search/crawler/lib/basecrawler.py
tools/search/crawler/lib/genericsite.py
tools/search/crawler/webcrawler.py
tools/search/sql/schema.sql

tools/search/crawler/lib/basecrawler.py
index 331de09dfdb68e032401a539bd1005cfdfc64712..300600efd507a514e80faf80b5bbbaafe83eef73 100644
@@ -232,7 +232,7 @@ class BaseSiteCrawler(object):
        def resolve_links(self, links, pageurl):
                for x in links:
                        p = urlparse.urlsplit(x)
-                       if p.scheme == "http":
+                       if p.scheme in ("http", "https"):
                                if p.netloc != self.hostname:
                                        # Remote link
                                        continue
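
The basecrawler change is the core of the commit: links with an https
scheme used to fall through the scheme check and be dropped, even when
they pointed back at the site being crawled. A minimal standalone sketch
of the widened filter, assuming Python 2 as in the rest of the crawler
(the function name and yield-based interface here are illustrative, not
from the commit):

    import urlparse

    def same_site_links(links, hostname):
        # Keep only links that point back at our own site, now accepting
        # https as well as http (https links used to be treated as remote).
        # Hypothetical standalone version of the filter in resolve_links().
        for x in links:
            p = urlparse.urlsplit(x)
            if p.scheme in ("http", "https") and p.netloc == hostname:
                yield p.path or "/"

    # Example:
    #   list(same_site_links(["https://example.org/a", "http://other.net/b"],
    #                        "example.org"))
    #   -> ["/a"]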
tools/search/crawler/lib/genericsite.py
index 5377516465c7b7687c90bf581639cdaf9b0484ec..7e46ae5beb5727fdca4aa223e7d5aa4ab06f1643 100644
@@ -4,8 +4,8 @@ from basecrawler import BaseSiteCrawler
 from parsers import RobotsParser
 
 class GenericSiteCrawler(BaseSiteCrawler):
-       def __init__(self, hostname, dbconn, siteid):
-               super(GenericSiteCrawler, self).__init__(hostname, dbconn, siteid)
+       def __init__(self, hostname, dbconn, siteid, https=False):
+               super(GenericSiteCrawler, self).__init__(hostname, dbconn, siteid, https=https)
 
        def init_crawl(self):
                # Load robots.txt
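
GenericSiteCrawler only forwards the new flag; the BaseSiteCrawler.__init__
side of the change is not shown in this diff. Presumably the base class
stores the flag and consults it when building absolute URLs to fetch. A
hedged sketch of what that could look like (full_url and the attribute
layout are assumptions, not taken from the commit):

    class BaseSiteCrawler(object):
        def __init__(self, hostname, dbconn, siteid, https=False):
            self.hostname = hostname
            self.dbconn = dbconn
            self.siteid = siteid
            self.https = https

        def full_url(self, path):
            # Pick the scheme the site expects when turning a crawled
            # path back into an absolute URL.
            return "%s://%s%s" % ("https" if self.https else "http",
                                  self.hostname, path)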
tools/search/crawler/webcrawler.py
index fa9939ced51a3f8976a3f46f67f9c6db9c0af627..504bf8eb176699c035ba73d69b016c7a033feda3 100755
@@ -22,10 +22,10 @@ def doit():
        conn.commit()
 
        # Skip id=1, which is the main site..
-       curs.execute("SELECT id, hostname FROM sites WHERE id>1")
-       for siteid, hostname in curs.fetchall():
+       curs.execute("SELECT id, hostname, https FROM sites WHERE id>1")
+       for siteid, hostname, https in curs.fetchall():
                log("Starting indexing of %s" % hostname)
-               GenericSiteCrawler(hostname, conn, siteid).crawl()
+               GenericSiteCrawler(hostname, conn, siteid, https).crawl()
                conn.commit()
 
        curs.execute("WITH t AS (SELECT site,count(*) AS c FROM webpages GROUP BY site) UPDATE sites SET pagecount=t.c FROM t WHERE id=t.site")
tools/search/sql/schema.sql
index 0fcae4927e74f099006523f6662a7cb8d027f840..ad8fca33489aff3d19e3b969cc1d48b64168e386 100644
@@ -24,6 +24,7 @@ CREATE TABLE sites (
    id int NOT NULL PRIMARY KEY,
    hostname text NOT NULL UNIQUE,
    description text NOT NULL,
+   https boolean NOT NULL DEFAULT 'f',
    pagecount int NOT NULL
 );
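
schema.sql only covers fresh installs, so an already-deployed search
database needs the column added by hand; the commit ships no migration
script. A hedged equivalent, using the same cursor style as webcrawler.py:

    # One-off migration for an existing database (assumption: the commit
    # itself only updates schema.sql for fresh installs).
    curs.execute("ALTER TABLE sites ADD COLUMN https boolean NOT NULL DEFAULT 'f'")
    # Then flip individual sites as they go https-only:
    curs.execute("UPDATE sites SET https='t' WHERE hostname=%(h)s",
                 {'h': 'wiki.example.org'})   # hypothetical hostname
    conn.commit()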