Move a bunch of initialization to the parse step
author Magnus Hagander <magnus@hagander.net>
Thu, 23 Mar 2017 15:39:03 +0000 (16:39 +0100)
committer Magnus Hagander <magnus@hagander.net>
Thu, 23 Mar 2017 15:39:03 +0000 (16:39 +0100)
We probably need to clean up more internal state in expat between the
calls.

tools/search/crawler/lib/sitemapsite.py

index 81c2f438960660983a2f3661a7004acce4f1a8bb..439bfbf39b7644ff5d4d8090820c2a2fc7c4ee20 100644 (file)
@@ -7,6 +7,9 @@ from lib.basecrawler import BaseSiteCrawler
 
 class SitemapParser(object):
        def __init__(self):
+               self.urls = []
+
+       def parse(self, f, internal=False):
                self.parser = xml.parsers.expat.ParserCreate()
                self.currenturl = ""
                self.currentprio = 0
@@ -16,9 +19,6 @@ class SitemapParser(object):
                self.getlastmod = False
                self.currstr = ""
                self.internal = False
-               self.urls = []
-
-       def parse(self, f, internal=False):
                self.parser.StartElementHandler = lambda name,attrs: self.processelement(name,attrs)
                self.parser.EndElementHandler = lambda name: self.processendelement(name)
                self.parser.CharacterDataHandler = lambda data: self.processcharacterdata(data)