From 428f299f48e4daf507cddfeb520e643c907b3227 Mon Sep 17 00:00:00 2001 From: Magnus Hagander Date: Thu, 23 Mar 2017 12:03:18 +0100 Subject: [PATCH] Generate internal sitemap including devel docs We'll use this to index some things in our own search engine without exposing it to external sitemap parsers. Not from a security standpoint of course, but something that will lead to it being possible to search the devel docs again. --- pgweb/core/views.py | 18 ++++++++++++++---- pgweb/docs/struct.py | 12 ++++++++++++ pgweb/urls.py | 1 + pgweb/util/sitestruct.py | 7 ++++--- 4 files changed, 31 insertions(+), 7 deletions(-) diff --git a/pgweb/core/views.py b/pgweb/core/views.py index a36e9792..35f55658 100644 --- a/pgweb/core/views.py +++ b/pgweb/core/views.py @@ -132,15 +132,13 @@ Sitemap: https://www.postgresql.org/sitemap.xml """, content_type='text/plain') -# Sitemap (XML format) -@cache(hours=6) -def sitemap(request): +def _make_sitemap(pagelist): resp = HttpResponse(content_type='text/xml') x = PgXmlHelper(resp) x.startDocument() x.startElement('urlset', {'xmlns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}) pages = 0 - for p in get_all_pages_struct(): + for p in pagelist: pages+=1 x.startElement('url', {}) x.add_xml_element('loc', 'https://www.postgresql.org/%s' % urllib.quote(p[0])) @@ -153,6 +151,18 @@ def sitemap(request): x.endDocument() return resp +# Sitemap (XML format) +@cache(hours=6) +def sitemap(request): + return _make_sitemap(get_all_pages_struct()) + +# Internal sitemap (only for our own search engine) +# Note! Still served up to anybody who wants it, so don't +# put anything secret in it... +@cache(hours=6) +def sitemap_internal(request): + return _make_sitemap(get_all_pages_struct(method='get_internal_struct')) + # dynamic CSS serving, meaning we merge a number of different CSS into a # single one, making sure it turns into a single http response. We do this # dynamically, since the output will be cached. diff --git a/pgweb/docs/struct.py b/pgweb/docs/struct.py index ce0b137e..d0325604 100644 --- a/pgweb/docs/struct.py +++ b/pgweb/docs/struct.py @@ -36,3 +36,15 @@ def get_struct(): if version == currentversion.tree: yield ('docs/current/static/%s' % filename, 1.0, loaded) + +# For our internal sitemap (used only by our own search engine), +# include the devel version of the docs (and only those, since the +# other versions are already included) +def get_internal_struct(): + curs = connection.cursor() + curs.execute("SELECT d.file, v.docsloaded FROM docs d INNER JOIN core_version v ON v.tree=d.version WHERE version = 0") + + for filename, loaded in curs.fetchall(): + yield ('docs/devel/static/%s' % (filename, ), + 0.1, + loaded) diff --git a/pgweb/urls.py b/pgweb/urls.py index e60df64e..904e54eb 100644 --- a/pgweb/urls.py +++ b/pgweb/urls.py @@ -81,6 +81,7 @@ urlpatterns = patterns('', # Sitemap (FIXME: support for >50k urls!) ### (r'^sitemap.xml', 'pgweb.core.views.sitemap'), + (r'^sitemap_internal.xml', 'pgweb.core.views.sitemap_internal'), ### # Workaround for broken links pushed in press release diff --git a/pgweb/util/sitestruct.py b/pgweb/util/sitestruct.py index 01153d80..be3cdec2 100644 --- a/pgweb/util/sitestruct.py +++ b/pgweb/util/sitestruct.py @@ -1,6 +1,6 @@ from django.conf import settings -def get_all_pages_struct(): +def get_all_pages_struct(method='get_struct'): """ Return an iterator over all distinct pages on the site. Each page is returned as a tuple consisting of: @@ -13,9 +13,10 @@ def get_all_pages_struct(): for app in settings.INSTALLED_APPS: if app.startswith('pgweb.'): try: - m = __import__(app+".struct", {}, {}, 'get_struct') + m = __import__(app+".struct", {}, {}, method) except: # Failed to import - probably module didnd't exist continue - for x in m.get_struct(): yield x + if hasattr(m, method): + for x in getattr(m, method)(): yield x -- 2.39.5