def resolve_links(self, links, pageurl):
    for x in links:
        p = urlparse.urlsplit(x)
-       if p.scheme == "http":
+       if p.scheme in ("http", "https"):
            if p.netloc != self.hostname:
                # Remote link
                continue
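For reference, and not part of the patch itself: p comes from urlparse.urlsplit
(the Python 2 module; urllib.parse in Python 3), and the widened test simply
accepts both schemes before the hostname comparison. With a made-up URL:

    import urlparse

    p = urlparse.urlsplit("https://www.example.org/docs/")
    print p.scheme, p.netloc    # prints: https www.example.org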
from parsers import RobotsParser

class GenericSiteCrawler(BaseSiteCrawler):
-   def __init__(self, hostname, dbconn, siteid):
-       super(GenericSiteCrawler, self).__init__(hostname, dbconn, siteid)
+   def __init__(self, hostname, dbconn, siteid, https=False):
+       super(GenericSiteCrawler, self).__init__(hostname, dbconn, siteid, https=https)

    def init_crawl(self):
        # Load robots.txt
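The https flag itself is consumed in BaseSiteCrawler, which this hunk does not
show; presumably it only decides which scheme is used when the crawler builds
the URLs it fetches. A minimal sketch of that idea, with base_url as a purely
illustrative helper name (not taken from the real code):

    def base_url(hostname, https=False):
        # Pick the scheme from the per-site flag; nothing else changes.
        return "%s://%s/" % ("https" if https else "http", hostname)

    print base_url("www.example.org", https=True)    # prints: https://www.example.org/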
conn.commit()

# Skip id=1, which is the main site..
- curs.execute("SELECT id, hostname FROM sites WHERE id>1")
- for siteid, hostname in curs.fetchall():
+ curs.execute("SELECT id, hostname, https FROM sites WHERE id>1")
+ for siteid, hostname, https in curs.fetchall():
    log("Starting indexing of %s" % hostname)
-   GenericSiteCrawler(hostname, conn, siteid).crawl()
+   GenericSiteCrawler(hostname, conn, siteid, https).crawl()
    conn.commit()

curs.execute("WITH t AS (SELECT site,count(*) AS c FROM webpages GROUP BY site) UPDATE sites SET pagecount=t.c FROM t WHERE id=t.site")
    id int NOT NULL PRIMARY KEY,
    hostname text NOT NULL UNIQUE,
    description text NOT NULL,
+   https boolean NOT NULL DEFAULT 'f',
    pagecount int NOT NULL
);
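An already-deployed database will not pick up the new column from the schema
file alone; assuming the same connection objects as in the crawl script above,
a one-off migration along these lines would bring an existing sites table in
line with the definition shown:

    curs.execute("ALTER TABLE sites ADD COLUMN https boolean NOT NULL DEFAULT 'f'")
    conn.commit()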