Don't generate http requests with two Host: headers
author: Magnus Hagander <magnus@hagander.net>
Tue, 7 Feb 2012 12:05:41 +0000 (13:05 +0100)
committer: Magnus Hagander <magnus@hagander.net>
Tue, 7 Feb 2012 12:05:41 +0000 (13:05 +0100)
This would cause http 400 requests at the server, unsurprisingly.

tools/search/crawler/lib/basecrawler.py

index 915d73a2444cabc78635dfffc75f79f876315aa8..16c14fa8fe0807e5a920f61304d8ca14a14df2a1 100644 (file)
@@ -164,14 +164,14 @@ class BaseSiteCrawler(object):
                try:
                        # Unfortunatley, persistent connections seem quite unreliable,
                        # so create a new one for each page.
-                       h = httplib.HTTPConnection(host=self.serverip and self.serverip or self.hostname,
-                                                                          port=80,
-                                                                          strict=True,
-                                                                          timeout=10)
-                       h.putrequest("GET", url)
-                       h.putheader("User-agent","pgsearch/0.2")
                        if self.serverip:
+                               h = httplib.HTTPConnection(host=self.serverip, port=80, strict=True, timeout=10)
+                               h.putrequest("GET", url, skip_host=1)
                                h.putheader("Host", self.hostname)
+                       else:
+                               h = httplib.HTTPConnection(host=self.hostname, port=80, strict=True, timeout=10)
+                               h.putrequest("GET", url)
+                       h.putheader("User-agent","pgsearch/0.2")
                        h.putheader("Connection","close")
                        if self.scantimes.has_key(url):
                                h.putheader("If-Modified-Since", formatdate(time.mktime(self.scantimes[url].timetuple())))