From f9486b54f1e01c29d1c842cf1caee0d539ebb8e6 Mon Sep 17 00:00:00 2001 From: Magnus Hagander Date: Tue, 7 Feb 2012 13:05:41 +0100 Subject: [PATCH] Don't generate http requests with two Host: headers This would cause HTTP 400 (Bad Request) responses from the server, unsurprisingly. --- tools/search/crawler/lib/basecrawler.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/search/crawler/lib/basecrawler.py b/tools/search/crawler/lib/basecrawler.py index 915d73a2..16c14fa8 100644 --- a/tools/search/crawler/lib/basecrawler.py +++ b/tools/search/crawler/lib/basecrawler.py @@ -164,14 +164,14 @@ class BaseSiteCrawler(object): try: # Unfortunatley, persistent connections seem quite unreliable, # so create a new one for each page. - h = httplib.HTTPConnection(host=self.serverip and self.serverip or self.hostname, - port=80, - strict=True, - timeout=10) - h.putrequest("GET", url) - h.putheader("User-agent","pgsearch/0.2") if self.serverip: + h = httplib.HTTPConnection(host=self.serverip, port=80, strict=True, timeout=10) + h.putrequest("GET", url, skip_host=1) h.putheader("Host", self.hostname) + else: + h = httplib.HTTPConnection(host=self.hostname, port=80, strict=True, timeout=10) + h.putrequest("GET", url) + h.putheader("User-agent","pgsearch/0.2") h.putheader("Connection","close") if self.scantimes.has_key(url): h.putheader("If-Modified-Since", formatdate(time.mktime(self.scantimes[url].timetuple()))) -- 2.39.5