from lib.parsers import GenericHtmlParser, lossy_unicode
class BaseSiteCrawler(object):
-     def __init__(self, hostname, dbconn, siteid, serverip=None):
+     def __init__(self, hostname, dbconn, siteid, serverip=None, https=False):
        self.hostname = hostname
        self.dbconn = dbconn
        self.siteid = siteid
        self.serverip = serverip
+         self.https = https
        self.pages_crawled = {}
        self.pages_new = 0
        self.pages_updated = 0
    def fetch_page(self, url):
        try:
+             if not self.https:
+                 port = 80
+                 connclass = httplib.HTTPConnection
+             else:
+                 port = 443
+                 connclass = httplib.HTTPSConnection
+
            # Unfortunately, persistent connections seem quite unreliable,
            # so create a new one for each page.
            if self.serverip:
-                 h = httplib.HTTPConnection(host=self.serverip, port=80, strict=True, timeout=10)
+                 h = connclass(host=self.serverip, port=port, strict=True, timeout=10)
                h.putrequest("GET", url, skip_host=1)
                h.putheader("Host", self.hostname)
            else:
-                 h = httplib.HTTPConnection(host=self.hostname, port=80, strict=True, timeout=10)
+                 h = connclass(host=self.hostname, port=port, strict=True, timeout=10)
                h.putrequest("GET", url)
            h.putheader("User-agent","pgsearch/0.2")
            h.putheader("Connection","close")
            self.currstr += data
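
The hunk above stops before the response is actually read. For context, a minimal standalone sketch of the same fetch pattern, written against Python 2's httplib as in the patch, might look like the following. The function name, the return value, and the completed response handling are illustrative additions, not part of the patch:

import httplib

def fetch_via_frontend(hostname, url, serverip=None, https=False):
    # Pick the connection class and default port from the https flag,
    # mirroring the logic added above.
    if https:
        connclass = httplib.HTTPSConnection
        port = 443
    else:
        connclass = httplib.HTTPConnection
        port = 80
    # Keyword arguments are deliberate: HTTPConnection and HTTPSConnection
    # take different positional parameters after host and port.
    # (Older Python 2 releases do not verify server certificates here.)
    h = connclass(host=serverip or hostname, port=port, strict=True, timeout=10)
    if serverip:
        # Connect to the frontend IP, but send the real hostname in the
        # Host header so virtual-host routing on the server still works.
        h.putrequest("GET", url, skip_host=1)
        h.putheader("Host", hostname)
    else:
        h.putrequest("GET", url)
    h.putheader("User-agent", "pgsearch/0.2")
    h.putheader("Connection", "close")
    h.endheaders()
    resp = h.getresponse()
    return resp.status, resp.read()
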
class SitemapSiteCrawler(BaseSiteCrawler):
-     def __init__(self, hostname, dbconn, siteid, serverip):
-         super(SitemapSiteCrawler, self).__init__(hostname, dbconn, siteid, serverip)
+     def __init__(self, hostname, dbconn, siteid, serverip, https=False):
+         super(SitemapSiteCrawler, self).__init__(hostname, dbconn, siteid, serverip, https)
    def init_crawl(self):
        # Fetch the sitemap. We ignore robots.txt in this case, and
        # fetch it unconditionally.
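
The init_crawl() hunk is cut off here; the comment refers to downloading the site's sitemap and seeding the crawl from it. As a rough sketch of that step only (not the crawler's actual implementation; the helper name and the 0.5 default-priority fallback are assumptions layered on the standard sitemap protocol), fetching and parsing sitemap.xml over HTTPS could look like:

import httplib
import xml.etree.ElementTree as ET

SITEMAP_NS = "{http://www.sitemaps.org/schemas/sitemap/0.9}"

def fetch_sitemap_urls(hostname):
    # Fetch /sitemap.xml over HTTPS and yield (url, priority) pairs.
    h = httplib.HTTPSConnection(host=hostname, port=443, timeout=10)
    h.request("GET", "/sitemap.xml")
    resp = h.getresponse()
    if resp.status != 200:
        raise Exception("sitemap fetch returned status %s" % resp.status)
    root = ET.fromstring(resp.read())
    for u in root.findall(SITEMAP_NS + "url"):
        loc = u.find(SITEMAP_NS + "loc").text
        prio = u.find(SITEMAP_NS + "priority")
        # The sitemap protocol treats a missing <priority> as 0.5.
        yield loc, float(prio.text) if prio is not None else 0.5
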
# Start by indexing the main website
log("Starting indexing of main website")
- SitemapSiteCrawler("www.postgresql.org", conn, 1, cp.get("search", "frontendip")).crawl()
+ SitemapSiteCrawler("www.postgresql.org", conn, 1, cp.get("search", "frontendip"), True).crawl()
conn.commit()
# Skip id=1, which is the main site..
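
The cp.get("search", "frontendip") call above implies that cp is a ConfigParser instance loaded from the crawler's configuration, with a [search] section holding the frontend IP. A minimal sketch of that setup (the filename is illustrative; the real config path is not shown in this excerpt):

from ConfigParser import ConfigParser

cp = ConfigParser()
cp.read("search.ini")  # illustrative filename only
frontendip = cp.get("search", "frontendip")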