From e7ec6be896342ac8beb3c9b961eaa38c9fdd1805 Mon Sep 17 00:00:00 2001 From: Magnus Hagander Date: Thu, 2 Sep 2010 14:35:21 +0200 Subject: [PATCH] If the given Last-Modified value is too far in the future, rewrite it to now(). Some blogs would return Last-Modified as 2038-, yet still honor the If-Modified-Since: 2038- header and thus never deliver any feeds at all. --- aggregator.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/aggregator.py b/aggregator.py index 862e97f..ca397e5 100755 --- a/aggregator.py +++ b/aggregator.py @@ -5,7 +5,7 @@ This file contains the functions to suck down RSS/Atom feeds (using feedparser) and store the results in a PostgreSQL database. -Copyright (C) 2008-2009 PostgreSQL Global Development Group +Copyright (C) 2008-2010 PostgreSQL Global Development Group """ import psycopg2 @@ -98,7 +98,14 @@ class Aggregator: if hasattr(feed, 'modified') and feed['modified']: # Last-Modified header retreived. If we did receive it, we will # trust the content (assuming we can parse it) - self.db.cursor().execute("UPDATE planet.feeds SET lastget=%(date)s WHERE id=%(feed)s AND NOT lastget=%(date)s", { 'date': datetime.datetime(*feed['modified'][:6]), 'feed': feedinfo[0]}) + d = datetime.datetime(*feed['modified'][:6]) + if (d-datetime.datetime.now()).days > 5: + # Except if it's ridiculously long in the future, we'll set it + # to right now instead, to deal with buggy blog software. We + # currently define ridiculously long as 5 days + d = datetime.datetime.now() + + self.db.cursor().execute("UPDATE planet.feeds SET lastget=%(date)s WHERE id=%(feed)s AND NOT lastget=%(date)s", { 'date': d, 'feed': feedinfo[0]}) else: # We didn't get a Last-Modified time, so set it to the entry date # for the latest entry in this feed. Only do this if we have more -- 2.39.5