More https fixes for search crawler
author    Magnus Hagander <magnus@hagander.net>
Wed, 25 May 2016 13:43:54 +0000 (15:43 +0200)
committer Magnus Hagander <magnus@hagander.net>
Wed, 25 May 2016 13:43:54 +0000 (15:43 +0200)
tools/search/crawler/lib/basecrawler.py
tools/search/crawler/lib/sitemapsite.py
tools/search/crawler/webcrawler.py

tools/search/crawler/lib/basecrawler.py
index d4c5f3461bf813bfb31a2ceb088a172209ece63f..3dd2bb5d65c42f2fff6fc97d63334e5b7d53267f 100644 (file)
@@ -11,11 +11,12 @@ from lib.log import log
 from lib.parsers import GenericHtmlParser, lossy_unicode
 
 class BaseSiteCrawler(object):
-       def __init__(self, hostname, dbconn, siteid, serverip=None):
+       def __init__(self, hostname, dbconn, siteid, serverip=None, https=False):
                self.hostname = hostname
                self.dbconn = dbconn
                self.siteid = siteid
                self.serverip = serverip
+               self.https = https
                self.pages_crawled = {}
                self.pages_new = 0
                self.pages_updated = 0
@@ -162,14 +163,21 @@ class BaseSiteCrawler(object):
 
        def fetch_page(self, url):
                try:
+                       if not self.https:
+                               port = 80
+                               connclass = httplib.HTTPConnection
+                       else:
+                               port = 443
+                               connclass = httplib.HTTPSConnection
+
                        # Unfortunately, persistent connections seem quite unreliable,
                        # so create a new one for each page.
                        if self.serverip:
-                               h = httplib.HTTPConnection(host=self.serverip, port=80, strict=True, timeout=10)
+                               h = connclass(host=self.serverip, port=port, strict=True, timeout=10)
                                h.putrequest("GET", url, skip_host=1)
                                h.putheader("Host", self.hostname)
                        else:
-                               h = httplib.HTTPConnection(host=self.hostname, port=80, strict=True, timeout=10)
+                               h = connclass(host=self.hostname, port=port, strict=True, timeout=10)
                                h.putrequest("GET", url)
                        h.putheader("User-agent","pgsearch/0.2")
                        h.putheader("Connection","close")
tools/search/crawler/lib/sitemapsite.py
index 4534a456a23f093eb84ad6f26cfeded63c110f5e..3d0ebd1b94f7dfd68402755370fcc856f01f728f 100644 (file)
@@ -57,8 +57,8 @@ class SitemapParser(object):
                        self.currstr += data
 
 class SitemapSiteCrawler(BaseSiteCrawler):
-       def __init__(self, hostname, dbconn, siteid, serverip):
-               super(SitemapSiteCrawler, self).__init__(hostname, dbconn, siteid, serverip)
+       def __init__(self, hostname, dbconn, siteid, serverip, https=False):
+               super(SitemapSiteCrawler, self).__init__(hostname, dbconn, siteid, serverip, https)
 
        def init_crawl(self):
                # Fetch the sitemap. We ignore robots.txt in this case, and
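
The subclass change simply threads the new flag through to the base constructor with the same default, so existing callers that omit https keep the old plain-HTTP behaviour. A small sketch of the pattern (class names shortened, illustrative only):

    class Base(object):
        def __init__(self, hostname, https=False):
            self.hostname = hostname
            self.https = https

    class Sitemap(Base):
        def __init__(self, hostname, https=False):
            # Forward the flag unchanged; omitting it keeps the plain-HTTP default.
            super(Sitemap, self).__init__(hostname, https)

    Sitemap("www.postgresql.org")                # http, as before
    Sitemap("www.postgresql.org", https=True)    # https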
tools/search/crawler/webcrawler.py
index c09674217bfc2ebaedaf4431ac1115a0f0167e72..fa9939ced51a3f8976a3f46f67f9c6db9c0af627 100755 (executable)
@@ -18,7 +18,7 @@ def doit():
 
        # Start by indexing the main website
        log("Starting indexing of main website")
-       SitemapSiteCrawler("www.postgresql.org", conn, 1, cp.get("search", "frontendip")).crawl()
+       SitemapSiteCrawler("www.postgresql.org", conn, 1, cp.get("search", "frontendip"), True).crawl()
        conn.commit()
 
        # Skip id=1, which is the main site..
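
For clarity, the trailing True in the new call is the https flag added above, so only the main website is forced to HTTPS here. An equivalent keyword form, assuming conn and cp are the database connection and ConfigParser already set up in doit():

    SitemapSiteCrawler("www.postgresql.org", conn, 1,
                       cp.get("search", "frontendip"),
                       https=True).crawl()
    conn.commit()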