From d9744cba44fc1a556cddf7be59affc7a4c5c92dc Mon Sep 17 00:00:00 2001
From: Magnus Hagander
Date: Sun, 2 Apr 2017 16:47:02 +0200
Subject: [PATCH] First stab at supporting https for generic sites

Previously only the main website search supported it, which was less
than great for community sites that are now https only.
---
 tools/search/crawler/lib/basecrawler.py | 2 +-
 tools/search/crawler/lib/genericsite.py | 4 ++--
 tools/search/crawler/webcrawler.py      | 6 +++---
 tools/search/sql/schema.sql             | 1 +
 4 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/tools/search/crawler/lib/basecrawler.py b/tools/search/crawler/lib/basecrawler.py
index 331de09d..300600ef 100644
--- a/tools/search/crawler/lib/basecrawler.py
+++ b/tools/search/crawler/lib/basecrawler.py
@@ -232,7 +232,7 @@ class BaseSiteCrawler(object):
     def resolve_links(self, links, pageurl):
         for x in links:
             p = urlparse.urlsplit(x)
-            if p.scheme == "http":
+            if p.scheme in ("http", "https"):
                 if p.netloc != self.hostname:
                     # Remote link
                     continue
diff --git a/tools/search/crawler/lib/genericsite.py b/tools/search/crawler/lib/genericsite.py
index 53775164..7e46ae5b 100644
--- a/tools/search/crawler/lib/genericsite.py
+++ b/tools/search/crawler/lib/genericsite.py
@@ -4,8 +4,8 @@ from basecrawler import BaseSiteCrawler
 from parsers import RobotsParser
 
 class GenericSiteCrawler(BaseSiteCrawler):
-    def __init__(self, hostname, dbconn, siteid):
-        super(GenericSiteCrawler, self).__init__(hostname, dbconn, siteid)
+    def __init__(self, hostname, dbconn, siteid, https=False):
+        super(GenericSiteCrawler, self).__init__(hostname, dbconn, siteid, https=https)
 
     def init_crawl(self):
         # Load robots.txt
diff --git a/tools/search/crawler/webcrawler.py b/tools/search/crawler/webcrawler.py
index fa9939ce..504bf8eb 100755
--- a/tools/search/crawler/webcrawler.py
+++ b/tools/search/crawler/webcrawler.py
@@ -22,10 +22,10 @@ def doit():
     conn.commit()
 
     # Skip id=1, which is the main site..
-    curs.execute("SELECT id, hostname FROM sites WHERE id>1")
-    for siteid, hostname in curs.fetchall():
+    curs.execute("SELECT id, hostname, https FROM sites WHERE id>1")
+    for siteid, hostname, https in curs.fetchall():
         log("Starting indexing of %s" % hostname)
-        GenericSiteCrawler(hostname, conn, siteid).crawl()
+        GenericSiteCrawler(hostname, conn, siteid, https).crawl()
         conn.commit()
 
     curs.execute("WITH t AS (SELECT site,count(*) AS c FROM webpages GROUP BY site) UPDATE sites SET pagecount=t.c FROM t WHERE id=t.site")
diff --git a/tools/search/sql/schema.sql b/tools/search/sql/schema.sql
index 0fcae492..ad8fca33 100644
--- a/tools/search/sql/schema.sql
+++ b/tools/search/sql/schema.sql
@@ -24,6 +24,7 @@ CREATE TABLE sites (
     id int NOT NULL PRIMARY KEY,
     hostname text NOT NULL UNIQUE,
     description text NOT NULL,
+    https boolean NOT NULL DEFAULT 'f',
     pagecount int NOT NULL
 );
 
-- 
2.39.5