From a7898accff342f94ea9a6cb1df1ce26367a9568f Mon Sep 17 00:00:00 2001 From: Michael Aquilina Date: Fri, 1 Aug 2014 21:44:08 +0200 Subject: [PATCH 001/100] Fix minor spelling mistake --- goose/outputformatters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/goose/outputformatters.py b/goose/outputformatters.py index ae42457b..df6741d7 100644 --- a/goose/outputformatters.py +++ b/goose/outputformatters.py @@ -47,7 +47,7 @@ def get_language(self): Returns the language is by the article or the configuration language """ - # we don't want to force the target laguage + # we don't want to force the target language # so we use the article.meta_lang if self.config.use_meta_language == True: if self.article.meta_lang: From b6a54f9047ee9bdfb2daa702cb54ad1cde5388d9 Mon Sep 17 00:00:00 2001 From: Michael Aquilina Date: Fri, 1 Aug 2014 21:44:23 +0200 Subject: [PATCH 002/100] Use PEP8 convention for boolean statements --- goose/outputformatters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/goose/outputformatters.py b/goose/outputformatters.py index df6741d7..1f8ba4bd 100644 --- a/goose/outputformatters.py +++ b/goose/outputformatters.py @@ -49,7 +49,7 @@ def get_language(self): """ # we don't want to force the target language # so we use the article.meta_lang - if self.config.use_meta_language == True: + if self.config.use_meta_language: if self.article.meta_lang: return self.article.meta_lang[:2] return self.config.target_language From 2916126acbac794066677e4559a622f5c8e3a395 Mon Sep 17 00:00:00 2001 From: Michael Aquilina Date: Fri, 1 Aug 2014 22:15:30 +0200 Subject: [PATCH 003/100] Do not fail when stopword list is not available for a certain language --- goose/text.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/goose/text.py b/goose/text.py index 4008d62b..dd4fb701 100644 --- a/goose/text.py +++ b/goose/text.py @@ -95,7 +95,12 @@ def __init__(self, language='en'): # to generate dynamic path for file to load if not language in self._cached_stop_words: path = os.path.join('text', 'stopwords-%s.txt' % language) - self._cached_stop_words[language] = set(FileHelper.loadResourceFile(path).splitlines()) + try: + content = FileHelper.loadResourceFile(path) + word_list = content.splitlines() + except IOError: + word_list = [] + self._cached_stop_words[language] = set(word_list) self.STOP_WORDS = self._cached_stop_words[language] def remove_punctuation(self, content): From b9330040eb7b6cb7bef2dbc4dcf2365a9fdfa4b5 Mon Sep 17 00:00:00 2001 From: Michael Aquilina Date: Sun, 3 Aug 2014 20:09:08 +0200 Subject: [PATCH 004/100] Fix minor spelling error "handling" --- goose/crawler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/goose/crawler.py b/goose/crawler.py index 211d410e..77dcb535 100644 --- a/goose/crawler.py +++ b/goose/crawler.py @@ -117,10 +117,10 @@ def crawl(self, crawl_candidate): # let's process it if self.article.top_node is not None: - # video handeling + # video handling self.video_extractor.get_videos() - # image handeling + # image handling if self.config.enable_image_fetching: self.get_image() From c31c9c4d992561f882f019d20ea790594aea3a67 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Sun, 28 Dec 2014 23:30:42 +0100 Subject: [PATCH 005/100] #157 - add test case files --- tests/data/extractors/test_opengraphcontent.html | 15 +++++++++++++++ tests/data/extractors/test_opengraphcontent.json | 6 ++++++ 2 files changed, 21 insertions(+) create mode 100644 
tests/data/extractors/test_opengraphcontent.html create mode 100644 tests/data/extractors/test_opengraphcontent.json diff --git a/tests/data/extractors/test_opengraphcontent.html b/tests/data/extractors/test_opengraphcontent.html new file mode 100644 index 00000000..46e5c9de --- /dev/null +++ b/tests/data/extractors/test_opengraphcontent.html @@ -0,0 +1,15 @@ + + +
+

+ Not an Actual Content + TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies. +

+
+
+

+ Search-and-rescue teams were mobilized from across Southeast Asia on Sunday after a commercial airliner with 162 people on board lost contact with ground controllers off the coast of Borneo, a search effort that evoked a distressingly familiar mix of grief and mystery nine months after a Malaysia Airlines jetliner disappeared over the Indian Ocean. +

+
+ + diff --git a/tests/data/extractors/test_opengraphcontent.json b/tests/data/extractors/test_opengraphcontent.json new file mode 100644 index 00000000..a775091d --- /dev/null +++ b/tests/data/extractors/test_opengraphcontent.json @@ -0,0 +1,6 @@ +{ + "url": "http://exemple.com/test_opengraphcontent", + "expected": { + "cleaned_text": "Search-and-rescue teams were mobilized from across Southeast Asia on Sunday after a commercial airliner with 162 people on board lost contact with ground controllers off the coast of Borneo, a search effort that evoked a distressingly familiar mix of grief and mystery nine months after a Malaysia Airlines jetliner disappeared over the Indian Ocean." + } +} From 5ac4a32e010308d209b8534fed824ae7d5683e98 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Sun, 28 Dec 2014 23:31:26 +0100 Subject: [PATCH 006/100] #157 - remove childnode one by one to keep parent node --- goose/cleaners.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/goose/cleaners.py b/goose/cleaners.py index 2e8bc87a..c1384ee0 100644 --- a/goose/cleaners.py +++ b/goose/cleaners.py @@ -246,7 +246,8 @@ def div_to_para(self, doc, dom_type): bad_divs += 1 elif div is not None: replaceNodes = self.get_replacement_nodes(doc, div) - div.clear() + for child in self.parser.childNodes(div): + div.remove(child) for c, n in enumerate(replaceNodes): div.insert(c, n) From 71f1deccfa836c6494ae7b35569dd86e4a77eff9 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Sun, 28 Dec 2014 23:32:42 +0100 Subject: [PATCH 007/100] #157 - hanbdle schema.org microdata --- goose/extractors.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/goose/extractors.py b/goose/extractors.py index 1c8a37f1..44db6b5e 100644 --- a/goose/extractors.py +++ b/goose/extractors.py @@ -41,6 +41,11 @@ RE_LANG = r'^[A-Za-z]{2}$' +KNOWN_CONTENT_TAGS = [ + {'attribute': 'itemprop', 'value': 'articleBody'} +] + + class ContentExtractor(object): def __init__(self, config, article): @@ -231,6 +236,11 @@ def extract_tags(self): return set(tags) def calculate_best_node(self): + + top_node_from_known_tags = self.get_top_node_from_known_tags() + if top_node_from_known_tags is not None: + return top_node_from_known_tags + doc = self.article.doc top_node = None nodes_to_check = self.nodes_to_check(doc) @@ -303,6 +313,22 @@ def calculate_best_node(self): return top_node + def is_known_tags_element(self, node): + for tag in KNOWN_CONTENT_TAGS: + if self.parser.getAttribute(node, tag['attribute']) == tag['value']: + return True + return False + + def get_top_node_from_known_tags(self): + for known_content_tag in KNOWN_CONTENT_TAGS: + content_tags = self.parser.getElementsByTag(self.article.doc, + attr=known_content_tag['attribute'], + value=known_content_tag['value']) + if len(content_tags): + top_node = content_tags[0] + self.parser.setAttribute(top_node, "extraction", "microDataExtration") + return content_tags[0] + def is_boostable(self, node): """\ alot of times the first paragraph might be the caption under an image @@ -341,8 +367,13 @@ def walk_siblings(self, node): return b def add_siblings(self, top_node): + # in case the extraction used known attributes + # we don't want to add sibilings + if self.is_known_tags_element(top_node): + return top_node baselinescore_siblings_para = self.get_siblings_score(top_node) results = self.walk_siblings(top_node) + print results for current_node in results: ps = self.get_siblings_content(current_node, baselinescore_siblings_para) for p in ps: From 
6215fface3064248a6b3eaf5a2cd9835e818fac6 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Sun, 28 Dec 2014 23:34:28 +0100 Subject: [PATCH 008/100] #157 - add test case --- tests/extractors.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/extractors.py b/tests/extractors.py index 84ba0502..a3e0dccb 100644 --- a/tests/extractors.py +++ b/tests/extractors.py @@ -355,6 +355,11 @@ def test_okaymarketing(self): fields = ['cleaned_text'] self.runArticleAssertions(article=article, fields=fields) + def test_opengraphcontent(self): + article = self.getArticle() + fields = ['cleaned_text'] + self.runArticleAssertions(article=article, fields=fields) + class TestExtractWithUrl(TestExtractionBase): From b8991df12db180d1b46919caab2eea643670b232 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Sun, 28 Dec 2014 23:35:40 +0100 Subject: [PATCH 009/100] #157 - remove print --- goose/extractors.py | 1 - 1 file changed, 1 deletion(-) diff --git a/goose/extractors.py b/goose/extractors.py index 44db6b5e..33cdf0b1 100644 --- a/goose/extractors.py +++ b/goose/extractors.py @@ -373,7 +373,6 @@ def add_siblings(self, top_node): return top_node baselinescore_siblings_para = self.get_siblings_score(top_node) results = self.walk_siblings(top_node) - print results for current_node in results: ps = self.get_siblings_content(current_node, baselinescore_siblings_para) for p in ps: From 9379cd83c1cdfe0468b792c3ba09c22e15c2af93 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Mon, 29 Dec 2014 00:17:30 +0100 Subject: [PATCH 010/100] #157 - corrected content with microdata --- tests/data/extractors/test_lefigaro.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/data/extractors/test_lefigaro.json b/tests/data/extractors/test_lefigaro.json index 311f4455..19f655ec 100644 --- a/tests/data/extractors/test_lefigaro.json +++ b/tests/data/extractors/test_lefigaro.json @@ -6,7 +6,7 @@ "domain": "www.lefigaro.fr", "final_url": "http://www.lefigaro.fr/conjoncture/2013/04/05/20002-20130405ARTFIG00473-montebourg-envisage-des-privatisations-partielles.php", "meta_keywords": "Actualit\u00e9 \u00e9conomique, entreprises, \u00e9conomie, bourse, emploi, imp\u00f4ts, cac 40, creation d'entreprise, chef d'entreprise, grands patrons, consommation, multinationales, privatisation, d\u00e9localisations, concurrence, monopole, crise, bourse, licenciements, union europ\u00e9enne, etats-unis, chine, pmi, pme, tpe, salaires, relance, pib, pnb, aides sociales, japon, r\u00e9cession, \u00e9conomie verte, fmi, reprise, croissance, news, actu", - "cleaned_text": "Selon le ministre du Redressement productif interview\u00e9 par le Wall Street Journal, le gouvernement", + "cleaned_text": "«Dans le cadre de l'effort de restructuration budgétaire", "tags": [ "EDF", "Privatisation", @@ -19,4 +19,4 @@ "meta_favicon": "http://www.lefigaro.fr/icones/favicon.ico", "meta_lang": null } -} \ No newline at end of file +} From f28a6e7ad4fd65b26df4556a0fee15dec384b0b3 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Mon, 29 Dec 2014 00:26:54 +0100 Subject: [PATCH 011/100] #157 - refactor --- goose/extractors.py | 50 ++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/goose/extractors.py b/goose/extractors.py index 33cdf0b1..f477c940 100644 --- a/goose/extractors.py +++ b/goose/extractors.py @@ -41,11 +41,6 @@ RE_LANG = r'^[A-Za-z]{2}$' -KNOWN_CONTENT_TAGS = [ - {'attribute': 'itemprop', 'value': 'articleBody'} -] - - class ContentExtractor(object): def 
__init__(self, config, article): @@ -214,6 +209,22 @@ def get_domain(self): return o.hostname return None + def get_articlebody(self): + article_body = self.parser.getElementsByTag( + self.article.doc, + attr='itemprop', + value='articleBody') + if len(article_body): + article_body = article_body[0] + self.parser.setAttribute(article_body, "extraction", "microDataExtration") + return article_body + return None + + def is_articlebody(self, node): + if self.parser.getAttribute(node, 'itemprop') == 'articleBody': + return True + return False + def extract_tags(self): node = self.article.doc @@ -237,10 +248,6 @@ def extract_tags(self): def calculate_best_node(self): - top_node_from_known_tags = self.get_top_node_from_known_tags() - if top_node_from_known_tags is not None: - return top_node_from_known_tags - doc = self.article.doc top_node = None nodes_to_check = self.nodes_to_check(doc) @@ -313,22 +320,6 @@ def calculate_best_node(self): return top_node - def is_known_tags_element(self, node): - for tag in KNOWN_CONTENT_TAGS: - if self.parser.getAttribute(node, tag['attribute']) == tag['value']: - return True - return False - - def get_top_node_from_known_tags(self): - for known_content_tag in KNOWN_CONTENT_TAGS: - content_tags = self.parser.getElementsByTag(self.article.doc, - attr=known_content_tag['attribute'], - value=known_content_tag['value']) - if len(content_tags): - top_node = content_tags[0] - self.parser.setAttribute(top_node, "extraction", "microDataExtration") - return content_tags[0] - def is_boostable(self, node): """\ alot of times the first paragraph might be the caption under an image @@ -369,7 +360,7 @@ def walk_siblings(self, node): def add_siblings(self, top_node): # in case the extraction used known attributes # we don't want to add sibilings - if self.is_known_tags_element(top_node): + if self.is_articlebody(top_node): return top_node baselinescore_siblings_para = self.get_siblings_score(top_node) results = self.walk_siblings(top_node) @@ -508,6 +499,13 @@ def nodes_to_check(self, doc): on like paragraphs and tables """ nodes_to_check = [] + + # microdata + # set the most score to articleBody node + article_body_node = self.get_articlebody() + if article_body_node is not None: + self.update_score(article_body_node, 99) + for tag in ['p', 'pre', 'td']: items = self.parser.getElementsByTag(doc, tag=tag) nodes_to_check += items From ced075f3458318a2bfd30f4392cc221eaa4a4862 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Mon, 29 Dec 2014 01:02:21 +0100 Subject: [PATCH 012/100] #160 - fail silently for unknown images --- goose/images/utils.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/goose/images/utils.py b/goose/images/utils.py index 2767416f..a70c6b5d 100644 --- a/goose/images/utils.py +++ b/goose/images/utils.py @@ -33,12 +33,15 @@ class ImageUtils(object): @classmethod def get_image_dimensions(self, identify_program, path): - image = Image.open(path) image_details = ImageDetails() - image_details.set_mime_type(image.format) - width, height = image.size - image_details.set_width(width) - image_details.set_height(height) + try: + image = Image.open(path) + image_details.set_mime_type(image.format) + width, height = image.size + image_details.set_width(width) + image_details.set_height(height) + except IOError: + image_details.set_mime_type('NA') return image_details @classmethod From 048bcdbe893399cf0b82e6ddd970f023d3faba35 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Mon, 29 Dec 2014 01:24:42 +0100 Subject: [PATCH 013/100] 
#161 - add parser list variable --- goose/configuration.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/goose/configuration.py b/goose/configuration.py index 42696f58..259c3e90 100644 --- a/goose/configuration.py +++ b/goose/configuration.py @@ -29,6 +29,11 @@ HTTP_DEFAULT_TIMEOUT = 30 +AVAILABLE_PARSERS = { + 'lxml' : Parser, + 'soupparser': ParserSoup, +} + class Configuration(object): @@ -84,6 +89,7 @@ def __init__(self): self.additional_data_extractor = None # Parser type + self.available_parsers = AVAILABLE_PARSERS.keys() self.parser_class = 'lxml' # set the local storage path @@ -94,7 +100,7 @@ def __init__(self): self.http_timeout = HTTP_DEFAULT_TIMEOUT def get_parser(self): - return Parser if self.parser_class == 'lxml' else ParserSoup + return AVAILABLE_PARSERS[self.parser_class] def get_publishdate_extractor(self): return self.extract_publishdate From eaaa60ae7a5c139a44f84395fed0e0cfb67556eb Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Mon, 29 Dec 2014 01:25:05 +0100 Subject: [PATCH 014/100] #161 - parser fallback --- goose/__init__.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/goose/__init__.py b/goose/__init__.py index 885dc6e5..49073bd1 100644 --- a/goose/__init__.py +++ b/goose/__init__.py @@ -59,8 +59,14 @@ def shutdown_network(self): pass def crawl(self, crawl_candiate): - crawler = Crawler(self.config) - article = crawler.crawl(crawl_candiate) + parsers = self.config.available_parsers + parsers.remove(self.config.parser_class) + try: + crawler = Crawler(self.config) + article = crawler.crawl(crawl_candiate) + except (UnicodeDecodeError, ValueError): + self.config.parser_class = parsers[0] + return self.crawl(crawl_candiate) return article def initialize(self): From f6647fc49a66c9c7c2ba2c09b2e2baf8c1acd831 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Mon, 29 Dec 2014 01:54:52 +0100 Subject: [PATCH 015/100] Merge pull request #5 from cronycle/feature/4-publish-date feature(extractors/publish_date): Extract publish date from meta tags. Conflicts: tests/extractors.py --- goose/configuration.py | 12 ------------ goose/crawler.py | 3 +-- goose/extractors.py | 14 ++++++++++++++ tests/data/extractors/test_publish_date.html | 7 +++++++ tests/data/extractors/test_publish_date.json | 6 ++++++ .../extractors/test_publish_date_article.html | 7 +++++++ .../extractors/test_publish_date_article.json | 6 ++++++ .../data/extractors/test_publish_date_rnews.html | 7 +++++++ .../data/extractors/test_publish_date_rnews.json | 6 ++++++ tests/extractors.py | 15 +++++++++++++++ 10 files changed, 69 insertions(+), 14 deletions(-) create mode 100644 tests/data/extractors/test_publish_date.html create mode 100644 tests/data/extractors/test_publish_date.json create mode 100644 tests/data/extractors/test_publish_date_article.html create mode 100644 tests/data/extractors/test_publish_date_article.json create mode 100644 tests/data/extractors/test_publish_date_rnews.html create mode 100644 tests/data/extractors/test_publish_date_rnews.json diff --git a/goose/configuration.py b/goose/configuration.py index 259c3e90..fe26b22a 100644 --- a/goose/configuration.py +++ b/goose/configuration.py @@ -102,18 +102,6 @@ def __init__(self): def get_parser(self): return AVAILABLE_PARSERS[self.parser_class] - def get_publishdate_extractor(self): - return self.extract_publishdate - - def set_publishdate_extractor(self, extractor): - """\ - Pass in to extract article publish dates. 
- @param extractor a concrete instance of PublishDateExtractor - """ - if not extractor: - raise ValueError("extractor must not be null!") - self.extract_publishdate = extractor - def get_additionaldata_extractor(self): return self.additional_data_extractor diff --git a/goose/crawler.py b/goose/crawler.py index 211d410e..192429f1 100644 --- a/goose/crawler.py +++ b/goose/crawler.py @@ -95,8 +95,7 @@ def crawl(self, crawl_candidate): self.article.raw_html = raw_html self.article.doc = doc self.article.raw_doc = deepcopy(doc) - # TODO - # self.article.publish_date = config.publishDateExtractor.extract(doc) + self.article.publish_date = self.extractor.get_publish_date() # self.article.additional_data = config.get_additionaldata_extractor.extract(doc) self.article.title = self.extractor.get_title() self.article.meta_lang = self.extractor.get_meta_lang() diff --git a/goose/extractors.py b/goose/extractors.py index f477c940..8b0146ce 100644 --- a/goose/extractors.py +++ b/goose/extractors.py @@ -39,6 +39,11 @@ A_REL_TAG_SELECTOR = "a[rel=tag]" A_HREF_TAG_SELECTOR = "a[href*='/tag/'], a[href*='/tags/'], a[href*='/topic/'], a[href*='?keyword=']" RE_LANG = r'^[A-Za-z]{2}$' +KNOWN_PUBLISH_DATE_META_TAGS = [ + {'attribute': 'property', 'value': 'rnews:datePublished'}, + {'attribute': 'property', 'value': 'article:published_time'}, + {'attribute': 'name', 'value': 'OriginalPublicationDate'}, +] class ContentExtractor(object): @@ -118,6 +123,15 @@ def split_title(self, title, splitter): title = title_pieces[large_text_index] return TITLE_REPLACEMENTS.replaceAll(title).strip() + def get_publish_date(self): + for known_meta_tag in KNOWN_PUBLISH_DATE_META_TAGS: + meta_tags = self.parser.getElementsByTag(self.article.doc, + tag='meta', + attr=known_meta_tag['attribute'], + value=known_meta_tag['value']) + if meta_tags: + return self.parser.getAttribute(meta_tags[0], attr='content') + def get_favicon(self): """\ Extract the favicon from a website diff --git a/tests/data/extractors/test_publish_date.html b/tests/data/extractors/test_publish_date.html new file mode 100644 index 00000000..6ce2b927 --- /dev/null +++ b/tests/data/extractors/test_publish_date.html @@ -0,0 +1,7 @@ + + + + + + + diff --git a/tests/data/extractors/test_publish_date.json b/tests/data/extractors/test_publish_date.json new file mode 100644 index 00000000..a37e1173 --- /dev/null +++ b/tests/data/extractors/test_publish_date.json @@ -0,0 +1,6 @@ +{ + "url": "http://example.com/example", + "expected": { + "publish_date": "2014-06-30T16:54:02+00:00" + } +} diff --git a/tests/data/extractors/test_publish_date_article.html b/tests/data/extractors/test_publish_date_article.html new file mode 100644 index 00000000..3d03667e --- /dev/null +++ b/tests/data/extractors/test_publish_date_article.html @@ -0,0 +1,7 @@ + + + + + + + diff --git a/tests/data/extractors/test_publish_date_article.json b/tests/data/extractors/test_publish_date_article.json new file mode 100644 index 00000000..06f14aa6 --- /dev/null +++ b/tests/data/extractors/test_publish_date_article.json @@ -0,0 +1,6 @@ +{ + "url": "http://example.com/example", + "expected": { + "publish_date": "2012-01-11T15:55:01+00:00" + } +} diff --git a/tests/data/extractors/test_publish_date_rnews.html b/tests/data/extractors/test_publish_date_rnews.html new file mode 100644 index 00000000..ca71f718 --- /dev/null +++ b/tests/data/extractors/test_publish_date_rnews.html @@ -0,0 +1,7 @@ + + + + + + + diff --git a/tests/data/extractors/test_publish_date_rnews.json 
b/tests/data/extractors/test_publish_date_rnews.json new file mode 100644 index 00000000..623b13bb --- /dev/null +++ b/tests/data/extractors/test_publish_date_rnews.json @@ -0,0 +1,6 @@ +{ + "url": "http://example.com/example", + "expected": { + "publish_date": "2010-02-22T11:53:04+00:00" + } +} diff --git a/tests/extractors.py b/tests/extractors.py index a3e0dccb..83907ac3 100644 --- a/tests/extractors.py +++ b/tests/extractors.py @@ -361,6 +361,21 @@ def test_opengraphcontent(self): self.runArticleAssertions(article=article, fields=fields) +class TestPublishDate(TestExtractionBase): + + def test_publish_date(self): + article = self.getArticle() + self.runArticleAssertions(article=article, fields=['publish_date']) + + def test_publish_date_rnews(self): + article = self.getArticle() + self.runArticleAssertions(article=article, fields=['publish_date']) + + def test_publish_date_article(self): + article = self.getArticle() + self.runArticleAssertions(article=article, fields=['publish_date']) + + class TestExtractWithUrl(TestExtractionBase): def test_get_canonical_url(self): From 2498065a5d017a0cea101ae4771cab7d0e00d1fa Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Mon, 29 Dec 2014 02:11:01 +0100 Subject: [PATCH 016/100] #163 - add schema published date parsing test --- .../data/extractors/test_publish_date_schema.html | 15 +++++++++++++++ .../data/extractors/test_publish_date_schema.json | 6 ++++++ tests/extractors.py | 4 ++++ 3 files changed, 25 insertions(+) create mode 100644 tests/data/extractors/test_publish_date_schema.html create mode 100644 tests/data/extractors/test_publish_date_schema.json diff --git a/tests/data/extractors/test_publish_date_schema.html b/tests/data/extractors/test_publish_date_schema.html new file mode 100644 index 00000000..8a666dfa --- /dev/null +++ b/tests/data/extractors/test_publish_date_schema.html @@ -0,0 +1,15 @@ + + + + test video + + + +
+ +

+ TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies. +

+
+ + diff --git a/tests/data/extractors/test_publish_date_schema.json b/tests/data/extractors/test_publish_date_schema.json new file mode 100644 index 00000000..8e150921 --- /dev/null +++ b/tests/data/extractors/test_publish_date_schema.json @@ -0,0 +1,6 @@ +{ + "url": "http://example.com/example", + "expected": { + "publish_date": "2014-10-09T12:06:16" + } +} diff --git a/tests/extractors.py b/tests/extractors.py index 83907ac3..cfcc4bfa 100644 --- a/tests/extractors.py +++ b/tests/extractors.py @@ -375,6 +375,10 @@ def test_publish_date_article(self): article = self.getArticle() self.runArticleAssertions(article=article, fields=['publish_date']) + def test_publish_date_schema(self): + article = self.getArticle() + self.runArticleAssertions(article=article, fields=['publish_date']) + class TestExtractWithUrl(TestExtractionBase): From 5910f39a29161ad79032d4dd06d3410840873091 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Mon, 29 Dec 2014 02:11:36 +0100 Subject: [PATCH 017/100] #163 - do not use only meta for publication date --- goose/extractors.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/goose/extractors.py b/goose/extractors.py index 8b0146ce..0163fbd0 100644 --- a/goose/extractors.py +++ b/goose/extractors.py @@ -39,10 +39,11 @@ A_REL_TAG_SELECTOR = "a[rel=tag]" A_HREF_TAG_SELECTOR = "a[href*='/tag/'], a[href*='/tags/'], a[href*='/topic/'], a[href*='?keyword=']" RE_LANG = r'^[A-Za-z]{2}$' -KNOWN_PUBLISH_DATE_META_TAGS = [ - {'attribute': 'property', 'value': 'rnews:datePublished'}, - {'attribute': 'property', 'value': 'article:published_time'}, - {'attribute': 'name', 'value': 'OriginalPublicationDate'}, +KNOWN_PUBLISH_DATE_TAGS = [ + {'attribute': 'property', 'value': 'rnews:datePublished', 'content': 'content'}, + {'attribute': 'property', 'value': 'article:published_time', 'content': 'content'}, + {'attribute': 'name', 'value': 'OriginalPublicationDate', 'content': 'content'}, + {'attribute': 'itemprop', 'value': 'datePublished', 'content': 'datetime'}, ] @@ -124,13 +125,12 @@ def split_title(self, title, splitter): return TITLE_REPLACEMENTS.replaceAll(title).strip() def get_publish_date(self): - for known_meta_tag in KNOWN_PUBLISH_DATE_META_TAGS: + for known_meta_tag in KNOWN_PUBLISH_DATE_TAGS: meta_tags = self.parser.getElementsByTag(self.article.doc, - tag='meta', attr=known_meta_tag['attribute'], value=known_meta_tag['value']) if meta_tags: - return self.parser.getAttribute(meta_tags[0], attr='content') + return self.parser.getAttribute(meta_tags[0], known_meta_tag['content']) def get_favicon(self): """\ From f8fc13dcf1d1b8a170a47dffb6d87fb35f45b68e Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Mon, 29 Dec 2014 02:41:56 +0100 Subject: [PATCH 018/100] #165 - add opengraph property to article --- goose/article.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/goose/article.py b/goose/article.py index d195f166..fbfc1478 100644 --- a/goose/article.py +++ b/goose/article.py @@ -64,6 +64,9 @@ def __init__(self): # been in the artcle, these are not meta keywords self.tags = set() + # holds a dict of all opengrah data found + self.opengraph = {} + # holds a list of any movies # we found on the page like youtube, vimeo self.movies = [] From a27cfffdb7056a1b3f3fe1327c31db27de467bb2 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Mon, 29 Dec 2014 02:42:18 +0100 Subject: [PATCH 019/100] #165 - extract opengraph data --- goose/crawler.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/goose/crawler.py b/goose/crawler.py index 
192429f1..64868986 100644 --- a/goose/crawler.py +++ b/goose/crawler.py @@ -106,6 +106,9 @@ def crawl(self, crawl_candidate): self.article.domain = self.extractor.get_domain() self.article.tags = self.extractor.extract_tags() + # opengraph + self.article.opengraph = self.extractor.extract_opengraph() + # before we do any calcs on the body itself let's clean up the document self.article.doc = self.cleaner.clean() From 101e69c81cfdd1ef0549b72268565e49caa334de Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Mon, 29 Dec 2014 02:43:31 +0100 Subject: [PATCH 020/100] #165 - opengraph extractor --- goose/extractors.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/goose/extractors.py b/goose/extractors.py index 0163fbd0..7356b669 100644 --- a/goose/extractors.py +++ b/goose/extractors.py @@ -239,6 +239,17 @@ def is_articlebody(self, node): return True return False + def extract_opengraph(self): + opengraph = {} + node = self.article.doc + metas = self.parser.getElementsByTag(node, 'meta') + for meta in metas: + attr = self.parser.getAttribute(meta, 'property') + if attr is not None and attr.startswith("og:"): + value = self.parser.getAttribute(meta, 'content') + opengraph.update({attr.split(":")[1]: value}) + return opengraph + def extract_tags(self): node = self.article.doc From eb1274b84833946cea69e87b7bcc651be4487a97 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Mon, 29 Dec 2014 02:52:37 +0100 Subject: [PATCH 021/100] #165 - rename dict --- goose/extractors.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/goose/extractors.py b/goose/extractors.py index 7356b669..0b1ff636 100644 --- a/goose/extractors.py +++ b/goose/extractors.py @@ -240,15 +240,15 @@ def is_articlebody(self, node): return False def extract_opengraph(self): - opengraph = {} + opengraph_dict = {} node = self.article.doc metas = self.parser.getElementsByTag(node, 'meta') for meta in metas: attr = self.parser.getAttribute(meta, 'property') if attr is not None and attr.startswith("og:"): value = self.parser.getAttribute(meta, 'content') - opengraph.update({attr.split(":")[1]: value}) - return opengraph + opengraph_dict.update({attr.split(":")[1]: value}) + return opengraph_dict def extract_tags(self): node = self.article.doc From c2eb34efa8819f9ab974a491bcf378163762c674 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Mon, 29 Dec 2014 02:53:09 +0100 Subject: [PATCH 022/100] #165 - opengraph extraction test --- tests/data/extractors/test_opengraph.html | 16 ++++++++++++++++ tests/data/extractors/test_opengraph.json | 12 ++++++++++++ 2 files changed, 28 insertions(+) create mode 100644 tests/data/extractors/test_opengraph.html create mode 100644 tests/data/extractors/test_opengraph.json diff --git a/tests/data/extractors/test_opengraph.html b/tests/data/extractors/test_opengraph.html new file mode 100644 index 00000000..bcc8cbb8 --- /dev/null +++ b/tests/data/extractors/test_opengraph.html @@ -0,0 +1,16 @@ + + + + + + + + + +
+

+ TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies. +

+
+ + diff --git a/tests/data/extractors/test_opengraph.json b/tests/data/extractors/test_opengraph.json new file mode 100644 index 00000000..ba05d768 --- /dev/null +++ b/tests/data/extractors/test_opengraph.json @@ -0,0 +1,12 @@ +{ + "url": "http://exemple.com/test_opengraphcontent", + "expected": { + "opengraph": { + "url": "http://www.somenews.com/2012/09/19/nyregion/some-news-article.html?pagewanted=all", + "image": "http://graphics8.somenews.com/images/2012/09/19/region/some-news-image.jpg", + "type": "article", + "description": "Some News Happened in New York", + "title": "Some News Article Story" + } + } +} From 6bbe2db44fad4a05acc5859709b48b0ccd1e9ef5 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Mon, 29 Dec 2014 02:54:33 +0100 Subject: [PATCH 023/100] #165 - rename article body extraction test --- .../{test_opengraphcontent.html => test_articlebody.html} | 0 .../{test_opengraphcontent.json => test_articlebody.json} | 0 tests/extractors.py | 7 ++++++- 3 files changed, 6 insertions(+), 1 deletion(-) rename tests/data/extractors/{test_opengraphcontent.html => test_articlebody.html} (100%) rename tests/data/extractors/{test_opengraphcontent.json => test_articlebody.json} (100%) diff --git a/tests/data/extractors/test_opengraphcontent.html b/tests/data/extractors/test_articlebody.html similarity index 100% rename from tests/data/extractors/test_opengraphcontent.html rename to tests/data/extractors/test_articlebody.html diff --git a/tests/data/extractors/test_opengraphcontent.json b/tests/data/extractors/test_articlebody.json similarity index 100% rename from tests/data/extractors/test_opengraphcontent.json rename to tests/data/extractors/test_articlebody.json diff --git a/tests/extractors.py b/tests/extractors.py index cfcc4bfa..eba69def 100644 --- a/tests/extractors.py +++ b/tests/extractors.py @@ -355,11 +355,16 @@ def test_okaymarketing(self): fields = ['cleaned_text'] self.runArticleAssertions(article=article, fields=fields) - def test_opengraphcontent(self): + def test_articlebody(self): article = self.getArticle() fields = ['cleaned_text'] self.runArticleAssertions(article=article, fields=fields) + def test_opengraph(self): + article = self.getArticle() + fields = ['opengraph'] + self.runArticleAssertions(article=article, fields=fields) + class TestPublishDate(TestExtractionBase): From 124371ed0b25e5ff2cf95c5a6062abfd01c132d7 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Mon, 29 Dec 2014 03:25:40 +0100 Subject: [PATCH 024/100] #139 - article links property --- goose/article.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/goose/article.py b/goose/article.py index fbfc1478..b8645f3d 100644 --- a/goose/article.py +++ b/goose/article.py @@ -71,6 +71,9 @@ def __init__(self): # we found on the page like youtube, vimeo self.movies = [] + # holds links found in the main article + self.links = [] + # stores the final URL that we're going to try # and fetch content against, this would be expanded if any self.final_url = u"" From 4adf4bceab9378f891cf083191336544738476aa Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Mon, 29 Dec 2014 03:26:06 +0100 Subject: [PATCH 025/100] #139 - extract article links --- goose/crawler.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/goose/crawler.py b/goose/crawler.py index 64868986..e23a72e9 100644 --- a/goose/crawler.py +++ b/goose/crawler.py @@ -119,6 +119,9 @@ def crawl(self, crawl_candidate): # let's process it if self.article.top_node is not None: + # article links + self.article.links = 
self.extractor.extract_links() + # video handeling self.video_extractor.get_videos() @@ -129,6 +132,9 @@ def crawl(self, crawl_candidate): # post cleanup self.article.top_node = self.extractor.post_cleanup() + # article links + self.article.links = self.extractor.extract_links() + # clean_text self.article.cleaned_text = self.formatter.get_formatted_text() From 0a3303e2183adcb68817552a9e09317ffa37848a Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Mon, 29 Dec 2014 03:26:33 +0100 Subject: [PATCH 026/100] #139 - article links extract method --- goose/extractors.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/goose/extractors.py b/goose/extractors.py index 0b1ff636..c2630003 100644 --- a/goose/extractors.py +++ b/goose/extractors.py @@ -250,6 +250,15 @@ def extract_opengraph(self): opengraph_dict.update({attr.split(":")[1]: value}) return opengraph_dict + def extract_links(self): + links = [] + items = self.parser.getElementsByTag(self.article.top_node, 'a') + for i in items: + attr = self.parser.getAttribute(i, 'href') + if attr: + links.append(attr) + return links + def extract_tags(self): node = self.article.doc From cda2ef624d875c332ea9caaed13353946d936bd0 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Mon, 29 Dec 2014 03:48:55 +0100 Subject: [PATCH 027/100] #142 - extract authors --- goose/article.py | 3 +++ goose/crawler.py | 1 + goose/extractors.py | 19 +++++++++++++++++++ 3 files changed, 23 insertions(+) diff --git a/goose/article.py b/goose/article.py index b8645f3d..c37f7d5e 100644 --- a/goose/article.py +++ b/goose/article.py @@ -74,6 +74,9 @@ def __init__(self): # holds links found in the main article self.links = [] + # hold author names + self.authors = [] + # stores the final URL that we're going to try # and fetch content against, this would be expanded if any self.final_url = u"" diff --git a/goose/crawler.py b/goose/crawler.py index e23a72e9..cf124f45 100644 --- a/goose/crawler.py +++ b/goose/crawler.py @@ -105,6 +105,7 @@ def crawl(self, crawl_candidate): self.article.canonical_link = self.extractor.get_canonical_link() self.article.domain = self.extractor.get_domain() self.article.tags = self.extractor.extract_tags() + self.article.authors = self.extractor.extract_authors() # opengraph self.article.opengraph = self.extractor.extract_opengraph() diff --git a/goose/extractors.py b/goose/extractors.py index c2630003..fd3fd62e 100644 --- a/goose/extractors.py +++ b/goose/extractors.py @@ -259,6 +259,25 @@ def extract_links(self): links.append(attr) return links + def extract_authors(self): + authors = [] + author_nodes = self.parser.getElementsByTag( + self.article.doc, + attr='itemprop', + value='author') + + for author in author_nodes: + name_nodes = self.parser.getElementsByTag( + author, + attr='itemprop', + value='name') + + if len(name_nodes) > 0: + name = self.parser.getText(name_nodes[0]) + authors.append(name) + + return list(set(authors)) + def extract_tags(self): node = self.article.doc From 675c077b6173d890e1a20918db3a919bd45f5105 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Mon, 29 Dec 2014 05:01:32 +0100 Subject: [PATCH 028/100] #169 - extract tweets --- goose/article.py | 3 +++ goose/crawler.py | 3 +++ goose/extractors.py | 9 +++++++++ 3 files changed, 15 insertions(+) diff --git a/goose/article.py b/goose/article.py index c37f7d5e..093a9d96 100644 --- a/goose/article.py +++ b/goose/article.py @@ -67,6 +67,9 @@ def __init__(self): # holds a dict of all opengrah data found self.opengraph = {} + # holds twitter embeds + 
self.tweets = [] + # holds a list of any movies # we found on the page like youtube, vimeo self.movies = [] diff --git a/goose/crawler.py b/goose/crawler.py index cf124f45..6afdb5f1 100644 --- a/goose/crawler.py +++ b/goose/crawler.py @@ -123,6 +123,9 @@ def crawl(self, crawl_candidate): # article links self.article.links = self.extractor.extract_links() + # tweets + self.article.tweets = self.extractor.extract_tweets() + # video handeling self.video_extractor.get_videos() diff --git a/goose/extractors.py b/goose/extractors.py index fd3fd62e..780a5d94 100644 --- a/goose/extractors.py +++ b/goose/extractors.py @@ -259,6 +259,15 @@ def extract_links(self): links.append(attr) return links + def extract_tweets(self): + tweets = [] + items = self.parser.getElementsByTag(self.article.top_node, tag='blockquote', attr="class", value="twitter-tweet") + for i in items: + for attr in ['gravityScore', 'gravityNodes']: + self.parser.delAttribute(i, attr) + tweets.append(self.parser.nodeToString(i)) + return tweets + def extract_authors(self): authors = [] author_nodes = self.parser.getElementsByTag( From af493289f30a35d5eb9ff0a2b6151c226fc1e921 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Mon, 29 Dec 2014 05:02:28 +0100 Subject: [PATCH 029/100] #169 - extract tweets --- goose/extractors.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/goose/extractors.py b/goose/extractors.py index 780a5d94..57ad0b92 100644 --- a/goose/extractors.py +++ b/goose/extractors.py @@ -261,11 +261,17 @@ def extract_links(self): def extract_tweets(self): tweets = [] - items = self.parser.getElementsByTag(self.article.top_node, tag='blockquote', attr="class", value="twitter-tweet") + items = self.parser.getElementsByTag( + self.article.top_node, + tag='blockquote', + attr="class", + value="twitter-tweet") + for i in items: for attr in ['gravityScore', 'gravityNodes']: self.parser.delAttribute(i, attr) tweets.append(self.parser.nodeToString(i)) + return tweets def extract_authors(self): From 90b3cac4d6a1b07bd7249864d2e29d249375b0d1 Mon Sep 17 00:00:00 2001 From: Sergey Kirillov Date: Mon, 29 Dec 2014 14:00:27 +0200 Subject: [PATCH 030/100] Replaced bare except with except Exception --- goose/images/utils.py | 2 +- goose/network.py | 2 +- goose/text.py | 2 +- goose/version.py | 2 +- setup.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/goose/images/utils.py b/goose/images/utils.py index a70c6b5d..daf5eddb 100644 --- a/goose/images/utils.py +++ b/goose/images/utils.py @@ -119,5 +119,5 @@ def fetch(self, http_client, src): f = urllib2.urlopen(req) data = f.read() return data - except: + except Exception: return None diff --git a/goose/network.py b/goose/network.py index 0a338a44..666a7d61 100644 --- a/goose/network.py +++ b/goose/network.py @@ -51,7 +51,7 @@ def get_html(self, url): self.result = urllib2.urlopen( self.request, timeout=self.config.http_timeout) - except: + except Exception: self.result = None # read the result content diff --git a/goose/text.py b/goose/text.py index 4008d62b..badbfadc 100644 --- a/goose/text.py +++ b/goose/text.py @@ -46,7 +46,7 @@ def encodeValue(value): value = smart_unicode(value) except (UnicodeEncodeError, DjangoUnicodeDecodeError): value = smart_str(value) - except: + except Exception: value = string_org return value diff --git a/goose/version.py b/goose/version.py index 43693f9c..875065c7 100644 --- a/goose/version.py +++ b/goose/version.py @@ -21,5 +21,5 @@ limitations under the License. 
""" -version_info = (1, 0, 22) +version_info = (1, 0, 23) __version__ = ".".join(map(str, version_info)) diff --git a/setup.py b/setup.py index 2e2b74c0..ebad2547 100644 --- a/setup.py +++ b/setup.py @@ -50,7 +50,7 @@ try: with open(os.path.join(os.path.dirname(__file__), 'README.rst')) as f: long_description = f.read() -except: +except Exception: long_description = description setup(name='goose-extractor', From 90b041dbcd45a790f6cca4b0055094345df5bb13 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Tue, 30 Dec 2014 00:32:47 +0100 Subject: [PATCH 031/100] #171 - do not increment version yet --- goose/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/goose/version.py b/goose/version.py index 875065c7..43693f9c 100644 --- a/goose/version.py +++ b/goose/version.py @@ -21,5 +21,5 @@ limitations under the License. """ -version_info = (1, 0, 23) +version_info = (1, 0, 22) __version__ = ".".join(map(str, version_info)) From 848acf8fba6f3cfbd4934569bec4f46fce71c00e Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Tue, 30 Dec 2014 00:53:02 +0100 Subject: [PATCH 032/100] #172 - tweet extraction tests --- tests/data/extractors/test_tweet.html | 21 +++++++++++++++++++++ tests/data/extractors/test_tweet.json | 6 ++++++ tests/extractors.py | 9 +++++++++ 3 files changed, 36 insertions(+) create mode 100644 tests/data/extractors/test_tweet.html create mode 100644 tests/data/extractors/test_tweet.json diff --git a/tests/data/extractors/test_tweet.html b/tests/data/extractors/test_tweet.html new file mode 100644 index 00000000..0a390dd8 --- /dev/null +++ b/tests/data/extractors/test_tweet.html @@ -0,0 +1,21 @@ + + +
+

+ TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. +

+ + Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies. +

+
+
+

+ TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies. +

+ + + +

+
+ + diff --git a/tests/data/extractors/test_tweet.json b/tests/data/extractors/test_tweet.json new file mode 100644 index 00000000..80986ad6 --- /dev/null +++ b/tests/data/extractors/test_tweet.json @@ -0,0 +1,6 @@ +{ + "url": "http://exemple.com/tweet/", + "expected": { + "tweets": 2 + } +} diff --git a/tests/extractors.py b/tests/extractors.py index eba69def..9a850003 100644 --- a/tests/extractors.py +++ b/tests/extractors.py @@ -439,6 +439,15 @@ def extract(self, instance): return article +class TestArticleTweet(TestExtractionBase): + + def test_tweet(self): + article = self.getArticle() + number_tweets = len(article.tweets) + expected_number_tweets = self.data['expected']['tweets'] + self.assertEqual(number_tweets, expected_number_tweets) + + class TestArticleTags(TestExtractionBase): def test_tags_kexp(self): From 321fb86e0938e8a95d289fd2e111dd4083a748bb Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Tue, 30 Dec 2014 01:05:04 +0100 Subject: [PATCH 033/100] #173 - authors extraction test case --- tests/data/extractors/test_author_schema.html | 12 ++++++++++++ tests/data/extractors/test_author_schema.json | 11 +++++++++++ tests/extractors.py | 8 ++++++++ 3 files changed, 31 insertions(+) create mode 100644 tests/data/extractors/test_author_schema.html create mode 100644 tests/data/extractors/test_author_schema.json diff --git a/tests/data/extractors/test_author_schema.html b/tests/data/extractors/test_author_schema.html new file mode 100644 index 00000000..da7cfab4 --- /dev/null +++ b/tests/data/extractors/test_author_schema.html @@ -0,0 +1,12 @@ + + + +
+

+ TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies. +

+
+ + diff --git a/tests/data/extractors/test_author_schema.json b/tests/data/extractors/test_author_schema.json new file mode 100644 index 00000000..32185d65 --- /dev/null +++ b/tests/data/extractors/test_author_schema.json @@ -0,0 +1,11 @@ +{ + "url": "http://exemple.com/tweet/", + "expected": { + "authors": [ + "KEVIN SACK", + "ADAM NOSSITER", + "PAM BELLUCK", + "SHERI FINK" + ] + } +} diff --git a/tests/extractors.py b/tests/extractors.py index 9a850003..2c9ccf86 100644 --- a/tests/extractors.py +++ b/tests/extractors.py @@ -448,6 +448,14 @@ def test_tweet(self): self.assertEqual(number_tweets, expected_number_tweets) +class TestArticleAuthor(TestExtractionBase): + + def test_author_schema(self): + article = self.getArticle() + fields = ['authors'] + self.runArticleAssertions(article=article, fields=fields) + + class TestArticleTags(TestExtractionBase): def test_tags_kexp(self): From 989ab243efac255b6347e0af4ec2f53d9ceba74f Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Tue, 30 Dec 2014 01:11:52 +0100 Subject: [PATCH 034/100] #175 - links extraction tests --- tests/data/extractors/test_links.html | 16 ++++++++++++++++ tests/data/extractors/test_links.json | 6 ++++++ tests/extractors.py | 9 +++++++++ 3 files changed, 31 insertions(+) create mode 100644 tests/data/extractors/test_links.html create mode 100644 tests/data/extractors/test_links.json diff --git a/tests/data/extractors/test_links.html b/tests/data/extractors/test_links.html new file mode 100644 index 00000000..c097d4ee --- /dev/null +++ b/tests/data/extractors/test_links.html @@ -0,0 +1,16 @@ + + +
+

+ TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. + links + Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies. +

+
+
+

+ TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies. +

+
+ + diff --git a/tests/data/extractors/test_links.json b/tests/data/extractors/test_links.json new file mode 100644 index 00000000..74f1c682 --- /dev/null +++ b/tests/data/extractors/test_links.json @@ -0,0 +1,6 @@ +{ + "url": "http://exemple.com/links/", + "expected": { + "links": 2 + } +} diff --git a/tests/extractors.py b/tests/extractors.py index 2c9ccf86..7d43b705 100644 --- a/tests/extractors.py +++ b/tests/extractors.py @@ -448,6 +448,15 @@ def test_tweet(self): self.assertEqual(number_tweets, expected_number_tweets) +class TestArticleLinks(TestExtractionBase): + + def test_links(self): + article = self.getArticle() + number_links = len(article.links) + expected_number_links = self.data['expected']['links'] + self.assertEqual(number_links, expected_number_links) + + class TestArticleAuthor(TestExtractionBase): def test_author_schema(self): From 96caa3c21afdd8db95ad6d3375a0eb3849a183f0 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Tue, 30 Dec 2014 01:47:11 +0100 Subject: [PATCH 035/100] #177 - tags are a list --- goose/article.py | 2 +- goose/extractors.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/goose/article.py b/goose/article.py index 093a9d96..6b144d4f 100644 --- a/goose/article.py +++ b/goose/article.py @@ -62,7 +62,7 @@ def __init__(self): # holds a set of tags that may have # been in the artcle, these are not meta keywords - self.tags = set() + self.tags = [] # holds a dict of all opengrah data found self.opengraph = {} diff --git a/goose/extractors.py b/goose/extractors.py index 57ad0b92..6d8a075d 100644 --- a/goose/extractors.py +++ b/goose/extractors.py @@ -312,7 +312,7 @@ def extract_tags(self): if tag: tags.append(tag) - return set(tags) + return list(set(tags)) def calculate_best_node(self): From 6338f6841f9d3d2f10b586012ffd9817b03817ed Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Tue, 30 Dec 2014 01:47:41 +0100 Subject: [PATCH 036/100] #177 - title is empty string by default --- goose/article.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/goose/article.py b/goose/article.py index 6b144d4f..48cb6578 100644 --- a/goose/article.py +++ b/goose/article.py @@ -26,7 +26,7 @@ class Article(object): def __init__(self): # title of the article - self.title = None + self.title = u"" # stores the lovely, pure text from the article, # stripped of html, formatting, etc... From 206f6e23dfed930a7a921bb710f96515c8ded956 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Tue, 30 Dec 2014 01:48:11 +0100 Subject: [PATCH 037/100] #177 - info method return article data as dict --- goose/article.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/goose/article.py b/goose/article.py index 48cb6578..c00ca4d6 100644 --- a/goose/article.py +++ b/goose/article.py @@ -106,3 +106,26 @@ def __init__(self): # A property bucket for consumers of goose to store custom data extractions. 
self.additional_data = {} + + @property + def infos(self): + data = { + "meta": { + "description": self.meta_description, + "lang": self.meta_lang, + "keywords": self.meta_keywords, + "favicon": self.meta_favicon, + "canonical": self.canonical_link, + }, + "domain": self.domain, + "title": self.title, + "cleaned_text": self.cleaned_text, + "opengraph": self.opengraph, + "tags": self.tags, + "tweets": self.tweets, + "movies": self.movies, + "links": self.links, + "authors": self.authors, + "publish_date": self.publish_date + } + return data From 37e24b291f352d40f54c720dd2921f75c53f0e81 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Tue, 30 Dec 2014 01:49:55 +0100 Subject: [PATCH 038/100] #177 - add top image to returned dict --- goose/article.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/goose/article.py b/goose/article.py index c00ca4d6..d4885616 100644 --- a/goose/article.py +++ b/goose/article.py @@ -117,6 +117,7 @@ def infos(self): "favicon": self.meta_favicon, "canonical": self.canonical_link, }, + "image": None, "domain": self.domain, "title": self.title, "cleaned_text": self.cleaned_text, @@ -128,4 +129,9 @@ def infos(self): "authors": self.authors, "publish_date": self.publish_date } + + # image + if self.top_image is not None: + data['image'] = self.top_image.src + return data From e452c23fe8b5ebc8eae76aa51aa1ed461f9e3b9a Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Tue, 30 Dec 2014 02:41:16 +0100 Subject: [PATCH 039/100] #129 - add issue test case --- tests/data/extractors/test_issue129.html | 1460 ++++++++++++++++++++++ tests/data/extractors/test_issue129.json | 6 + tests/extractors.py | 5 + 3 files changed, 1471 insertions(+) create mode 100644 tests/data/extractors/test_issue129.html create mode 100644 tests/data/extractors/test_issue129.json diff --git a/tests/data/extractors/test_issue129.html b/tests/data/extractors/test_issue129.html new file mode 100644 index 00000000..9f523cbc --- /dev/null +++ b/tests/data/extractors/test_issue129.html @@ -0,0 +1,1460 @@ + + + + + + + + + + + + + + + + + +Lost in JIT: PyPy and the road towards SciPy + + + + + + + +
Thursday, October 27, 2011

PyPy and the road towards SciPy

Hello


PyPy's recent effort to bring NumPy support and the associated fundraiser
caused a lot of discussion in the SciPy community regarding PyPy, NumPy,
SciPy and the future of numeric computing in Python.


There were discussions on the topic, as well as various blog posts
from the SciPy community, which addressed a few of the issues. It seems there was a lot
of talking past each other, and I would like to clarify a few points here,
although this should be taken as my personal opinion on the subject.


So, let's start from the beginning. There are no plans for PyPy to
reimplement everything that's out there in RPython. That has been pointed
out from the beginning as a fallacy of our approach -- we simply don't plan
to do that. We agree that Python is a great glue language and we would like
to keep it that way. PyPy can nicely interface with C using ctypes with
a slightly worse story for C++ (even though there were experiments).
What we know by now is that the CPython C API is not a very good glue for PyPy:
it's too tied to CPython, and it prevents a lot of interesting optimizations
from happening. There are a few contenders, with Cython being the favorite
for now; however, for Cython to be usable we need to have a story for C++
(I know Cython does have a story but it's unclear how that would work with
the PyPy backend).
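
To make the ctypes point concrete, here is a minimal sketch of the kind of C glue that runs unchanged on CPython and PyPy (assuming a Unix-like system where ctypes.util.find_library can locate libc):

    import ctypes
    import ctypes.util

    # load the C library and declare strlen's signature explicitly
    libc = ctypes.CDLL(ctypes.util.find_library("c"))
    libc.strlen.argtypes = [ctypes.c_char_p]
    libc.strlen.restype = ctypes.c_size_t

    print libc.strlen("hello world")  # -> 11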


Which brings me to second point that while a lot of code in packages like
SciPy or matplotlib should be reusable in PyPy, it's probably not in
the current form. Either a lot of it has to move to Cython or some other
way of interfacing with C will come across. This should make it clear that
we want to interface with SciPy and reuse as much as possible.


Another recurring topic that seems to pop up is why we just don't reuse Cython
for NumPy instead of reimplementing everything. The problem is that we need
a robust array type with all the interface before we can start using Cython
for anything. Since we're going to implement it anyway, why not go all the way
and implement the full NumPy module? And that is the topic of the current
funding proposal is exactly that -- to provide full NumPy module. That
would be a very good start for integrating the full stack of SciPy and
matplotlib and all other libraries out there.


But also the trick is that a robust array module can go a long way alone.
It allows you to prototype a lot of algorithms on it's own and generally has
it's uses, without having to worry "but if I read all the elements from the
array it's going to be dog slow".


The last accusation is that we're trying to split the community. The answer is
simply no. We have a relatively good roadmap how to get to support what's out
there in scientific community and ideally support all people out there. This
will however take some time and the group of people that can run their
stuff on top of PyPy will be growing over time. This is indeed precisely what
is happening in other areas of python world -- more and more stuff run on PyPy
and people find it more and more interesting to try and to adapt their
own stuff to run.


To summarize, I don't really think there is that much of a gap between us
and SciPy people. We'll start small (by providing full NumPy implementation)
and then gradually move forward reusing as much as possible from the entire
stack.


Cheers,
fijal

+ + + + + + diff --git a/tests/data/extractors/test_issue129.json b/tests/data/extractors/test_issue129.json new file mode 100644 index 00000000..ddf6cbc8 --- /dev/null +++ b/tests/data/extractors/test_issue129.json @@ -0,0 +1,6 @@ +{ + "url": "http://lostinjit.blogspot.fr/2011/10/pypy-and-road-towards-scipy.html", + "expected": { + "cleaned_text": "Recent PyPys effort to bring NumPy and the associated fundraiser" + } +} diff --git a/tests/extractors.py b/tests/extractors.py index 7d43b705..458a0705 100644 --- a/tests/extractors.py +++ b/tests/extractors.py @@ -365,6 +365,11 @@ def test_opengraph(self): fields = ['opengraph'] self.runArticleAssertions(article=article, fields=fields) + def test_issue129(self): + article = self.getArticle() + fields = ['cleaned_text'] + self.runArticleAssertions(article=article, fields=fields) + class TestPublishDate(TestExtractionBase): From a36b5a8ae1291fdf6e7e7e3e469ec3768faa7cfa Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Tue, 30 Dec 2014 02:41:44 +0100 Subject: [PATCH 040/100] #129 - force articleBody to be the document root if found --- goose/crawler.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/goose/crawler.py b/goose/crawler.py index 6afdb5f1..e25c7fee 100644 --- a/goose/crawler.py +++ b/goose/crawler.py @@ -110,6 +110,13 @@ def crawl(self, crawl_candidate): # opengraph self.article.opengraph = self.extractor.extract_opengraph() + # check for an articleBody + # if we find one force the article.doc to be the articleBody node + # this will prevent the cleaner to remove unwanted text content + article_body = self.extractor.get_articlebody() + if article_body is not None: + self.article.doc = article_body + # before we do any calcs on the body itself let's clean up the document self.article.doc = self.cleaner.clean() From b04f1e9f82c917f6aca648a65e6c484f7bb5dfb9 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Tue, 30 Dec 2014 03:09:47 +0100 Subject: [PATCH 041/100] #137 - opengraph title test case --- tests/data/extractors/test_title_opengraph.html | 13 +++++++++++++ tests/data/extractors/test_title_opengraph.json | 6 ++++++ tests/extractors.py | 5 +++++ 3 files changed, 24 insertions(+) create mode 100644 tests/data/extractors/test_title_opengraph.html create mode 100644 tests/data/extractors/test_title_opengraph.json diff --git a/tests/data/extractors/test_title_opengraph.html b/tests/data/extractors/test_title_opengraph.html new file mode 100644 index 00000000..dbafee7a --- /dev/null +++ b/tests/data/extractors/test_title_opengraph.html @@ -0,0 +1,13 @@ + + + + Wrong article title - website + + +
+

+ TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies. +

+
+ + diff --git a/tests/data/extractors/test_title_opengraph.json b/tests/data/extractors/test_title_opengraph.json new file mode 100644 index 00000000..b4b6cdea --- /dev/null +++ b/tests/data/extractors/test_title_opengraph.json @@ -0,0 +1,6 @@ +{ + "url": "http://exemple.com/test_opengraphcontent", + "expected": { + "title": "Good article title" + } +} diff --git a/tests/extractors.py b/tests/extractors.py index 458a0705..ff4825ed 100644 --- a/tests/extractors.py +++ b/tests/extractors.py @@ -365,6 +365,11 @@ def test_opengraph(self): fields = ['opengraph'] self.runArticleAssertions(article=article, fields=fields) + def test_title_opengraph(self): + article = self.getArticle() + fields = ['title'] + self.runArticleAssertions(article=article, fields=fields) + def test_issue129(self): article = self.getArticle() fields = ['cleaned_text'] From 655aca6424c549238bc4ab80ee2ea10c212f5f03 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Tue, 30 Dec 2014 04:36:44 +0100 Subject: [PATCH 042/100] #137 - test separator --- tests/data/extractors/test_title_opengraph.html | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/data/extractors/test_title_opengraph.html b/tests/data/extractors/test_title_opengraph.html index dbafee7a..6e6c0c64 100644 --- a/tests/data/extractors/test_title_opengraph.html +++ b/tests/data/extractors/test_title_opengraph.html @@ -1,6 +1,7 @@ - + + Wrong article title - website From 3ff269e8ef32f8795e0de7d7954cb9902b5be7bf Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Tue, 30 Dec 2014 04:37:16 +0100 Subject: [PATCH 043/100] #137 - use og:title in test case --- tests/data/extractors/test_time.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/data/extractors/test_time.json b/tests/data/extractors/test_time.json index 31341c9c..05cb400c 100644 --- a/tests/data/extractors/test_time.json +++ b/tests/data/extractors/test_time.json @@ -6,8 +6,8 @@ "final_url": "http://www.time.com/time/health/article/0,8599,2011497,00.html", "meta_keywords": "bp, oil, spill, gulf, mexico, invisible, dispersed, deepwater horizon, Charles Hopkinson", "cleaned_text": "This month, the federal government released", - "title": "Invisible Oil from BP Spill May Threaten Gulf Aquatic Life", + "title": "Oil from Spill Could Still Pose Major Threat", "meta_favicon": "http://img.timeinc.net/time/favicon.ico", "meta_lang": null } -} \ No newline at end of file +} From d31112b34374973f49846ee74f578453062c37f2 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Tue, 30 Dec 2014 04:37:43 +0100 Subject: [PATCH 044/100] #137 - corrected title --- tests/data/extractors/test_allnewlyrics1.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/data/extractors/test_allnewlyrics1.json b/tests/data/extractors/test_allnewlyrics1.json index 4f8e8cc1..53cd1cf8 100644 --- a/tests/data/extractors/test_allnewlyrics1.json +++ b/tests/data/extractors/test_allnewlyrics1.json @@ -10,8 +10,8 @@ "PJ Morton", "Stevie Wonder" ], - "title": "PJ Morton (Ft. Stevie Wonder)", + "title": "\u201cOnly One\u201d Lyrics : PJ Morton (Ft. 
Stevie Wonder)", "meta_favicon": "", "meta_lang": "en" } -} \ No newline at end of file +} From 0e370dc900bd11b890657dd082f1255ef6d96cbe Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Tue, 30 Dec 2014 04:38:05 +0100 Subject: [PATCH 045/100] #137 - corrected title --- tests/data/extractors/test_cnn1.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/data/extractors/test_cnn1.json b/tests/data/extractors/test_cnn1.json index b847add0..ced9eb91 100644 --- a/tests/data/extractors/test_cnn1.json +++ b/tests/data/extractors/test_cnn1.json @@ -6,8 +6,8 @@ "final_url": "http://www.cnn.com/2010/POLITICS/08/13/democrats.social.security/index.html", "meta_keywords": "", "cleaned_text": "Washington (CNN) -- Democrats pledged ", - "title": "Democrats to use Social Security against GOP this fall", + "title": "Democrats to use Social Security against GOP this fall - CNN.com", "meta_favicon": "http://i.cdn.turner.com/cnn/.element/img/3.0/global/misc/apple-touch-icon.png", "meta_lang": "en" } -} \ No newline at end of file +} From 66b63fcf0e1f44629c9164b92d74e72e894a56d6 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Tue, 30 Dec 2014 04:39:16 +0100 Subject: [PATCH 046/100] #137 - fetch opengraph before title --- goose/crawler.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/goose/crawler.py b/goose/crawler.py index e25c7fee..43aaf4ea 100644 --- a/goose/crawler.py +++ b/goose/crawler.py @@ -95,9 +95,9 @@ def crawl(self, crawl_candidate): self.article.raw_html = raw_html self.article.doc = doc self.article.raw_doc = deepcopy(doc) + self.article.opengraph = self.extractor.extract_opengraph() self.article.publish_date = self.extractor.get_publish_date() # self.article.additional_data = config.get_additionaldata_extractor.extract(doc) - self.article.title = self.extractor.get_title() self.article.meta_lang = self.extractor.get_meta_lang() self.article.meta_favicon = self.extractor.get_favicon() self.article.meta_description = self.extractor.get_meta_description() @@ -106,9 +106,7 @@ def crawl(self, crawl_candidate): self.article.domain = self.extractor.get_domain() self.article.tags = self.extractor.extract_tags() self.article.authors = self.extractor.extract_authors() - - # opengraph - self.article.opengraph = self.extractor.extract_opengraph() + self.article.title = self.extractor.get_title() # check for an articleBody # if we find one force the article.doc to be the articleBody node From bd96c943393f271fb04b82ecc6d0b7fff1b59c19 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Tue, 30 Dec 2014 04:39:57 +0100 Subject: [PATCH 047/100] #137 - refactor title extraction based on opengraph, meta headling and title element --- goose/extractors.py | 86 +++++++++++++++++++++++++++++---------------- 1 file changed, 56 insertions(+), 30 deletions(-) diff --git a/goose/extractors.py b/goose/extractors.py index 6d8a075d..07f5321f 100644 --- a/goose/extractors.py +++ b/goose/extractors.py @@ -30,6 +30,7 @@ MOTLEY_REPLACEMENT = StringReplacement("�", "") ESCAPED_FRAGMENT_REPLACEMENT = StringReplacement(u"#!", u"?_escaped_fragment_=") TITLE_REPLACEMENTS = ReplaceSequence().create(u"»").append(u"»") +TITLE_SPLITTERS = [u"|", u"-", u"»", u":"] PIPE_SPLITTER = StringSplitter("\\|") DASH_SPLITTER = StringSplitter(" - ") ARROWS_SPLITTER = StringSplitter("»") @@ -65,44 +66,69 @@ def __init__(self, config, article): # stopwords class self.stopwords_class = config.stopwords_class + def clean_title(self, title): + """Clean title with the use of og:site_name + in 
this case try to get rid of the site name
+        and use TITLE_SPLITTERS to reformat title
+        """
+        # check if we have the site name in opengraph data
+        if "site_name" in self.article.opengraph.keys():
+            site_name = self.article.opengraph['site_name']
+            # remove the site name from title
+            title = title.replace(site_name, '').strip()
+
+        # try to remove the domain from url
+        if self.article.domain:
+            pattern = re.compile(self.article.domain, re.IGNORECASE)
+            title = pattern.sub("", title).strip()
+
+        # split the title in words
+        # TechCrunch | my wonderful article
+        # my wonderful article | TechCrunch
+        title_words = title.split()
+
+        # check if the first word is a splitter
+        # if so remove it
+        if title_words[0] in TITLE_SPLITTERS:
+            title_words.pop(0)
+
+        # check if the last word is a splitter
+        # if so remove it
+        if title_words[-1] in TITLE_SPLITTERS:
+            title_words.pop(-1)
+
+        # rebuild the title
+        title = u" ".join(title_words).strip()
+
+        return title
+
     def get_title(self):
         """\
         Fetch the article title and analyze it
         """
-        title = ''
-        doc = self.article.doc
-        title_element = self.parser.getElementsByTag(doc, tag='title')
-        # no title found
-        if title_element is None or len(title_element) == 0:
-            return title
+        # rely on opengraph in case we have the data
+        if "title" in self.article.opengraph.keys():
+            title = self.article.opengraph['title']
+            return self.clean_title(title)
 
-        # title elem found
-        title_text = self.parser.getText(title_element[0])
-        used_delimeter = False
-
-        # split title with |
-        if '|' in title_text:
-            title_text = self.split_title(title_text, PIPE_SPLITTER)
-            used_delimeter = True
-
-        # split title with -
-        if not used_delimeter and '-' in title_text:
-            title_text = self.split_title(title_text, DASH_SPLITTER)
-            used_delimeter = True
-
-        # split title with »
-        if not used_delimeter and u'»' in title_text:
-            title_text = self.split_title(title_text, ARROWS_SPLITTER)
-            used_delimeter = True
-
-        # split title with :
-        if not used_delimeter and ':' in title_text:
-            title_text = self.split_title(title_text, COLON_SPLITTER)
-            used_delimeter = True
+        # try to fetch the meta headline
+        meta_headline = self.parser.getElementsByTag(
+            self.article.doc,
+            tag="meta",
+            attr="name",
+            value="headline")
+        if meta_headline is not None and len(meta_headline) > 0:
+            title = self.parser.getAttribute(meta_headline[0], 'content')
+            return self.clean_title(title)
+
+        # otherwise use the title meta
+        title_element = self.parser.getElementsByTag(self.article.doc, tag='title')
+        if title_element is not None and len(title_element) > 0:
+            title = self.parser.getText(title_element[0])
+            return self.clean_title(title)
 
-        title = MOTLEY_REPLACEMENT.replaceAll(title_text)
         return title
 
     def split_title(self, title, splitter):

From 148ce9bd6c84fe761f05d33f55a54da24b9a72c7 Mon Sep 17 00:00:00 2001
From: Xavier Grangier
Date: Tue, 30 Dec 2014 04:40:36 +0100
Subject: [PATCH 048/100] #137 - more explicit error message

---
 tests/extractors.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/extractors.py b/tests/extractors.py
index ff4825ed..dccae5b2 100644
--- a/tests/extractors.py
+++ b/tests/extractors.py
@@ -125,7 +125,7 @@ def runArticleAssertions(self, article, fields):
                 continue
 
             # default assertion
-            msg = u"Error %s" % field
+            msg = u"Error %s \nexpected: %s\nresult: %s" % (field, expected_value, result_value)
             self.assertEqual(expected_value, result_value, msg=msg)
 
     def extract(self, instance):
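A note for readers of the two patches above: get_title() now funnels every
candidate (og:title, meta headline, <title>) through clean_title(). The
sketch below is a standalone reduction of that cleaning step, not the patched
class itself; the sample title and site name are invented.

    # reduction of clean_title() for illustration only
    TITLE_SPLITTERS = [u"|", u"-", u"\u00bb", u":"]

    def clean_title(title, site_name=None):
        # drop the og:site_name portion from the raw title
        if site_name:
            title = title.replace(site_name, '').strip()
        # drop a leading or trailing splitter token left behind
        title_words = title.split()
        if title_words and title_words[0] in TITLE_SPLITTERS:
            title_words.pop(0)
        if title_words and title_words[-1] in TITLE_SPLITTERS:
            title_words.pop(-1)
        return u" ".join(title_words).strip()

    print(clean_title(u"Good article title - website", site_name=u"website"))
    # -> Good article title

The guard on an empty title_words list is added here for safety: the patched
method indexes title_words[0] directly, so a title consisting only of the
site name would raise an IndexError.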
From e404f1bd0bf4a0571e53378aa8ab87c287a914d6 Mon Sep 17 00:00:00 2001
From: Xavier Grangier
Date: Tue, 30 Dec 2014 07:02:06 +0100
Subject: [PATCH 049/100] #115 - remove businessinsider test cases due to invalid html

---
 .../extractors/test_businessinsider1.html | 2211 ----------------
 .../extractors/test_businessinsider1.json |   12 -
 .../extractors/test_businessinsider2.html | 2278 -----------------
 .../extractors/test_businessinsider2.json |   12 -
 tests/extractors.py                       |   10 -
 5 files changed, 4523 deletions(-)
 delete mode 100644 tests/data/extractors/test_businessinsider1.html
 delete mode 100644 tests/data/extractors/test_businessinsider1.json
 delete mode 100644 tests/data/extractors/test_businessinsider2.html
 delete mode 100644 tests/data/extractors/test_businessinsider2.json

diff --git a/tests/data/extractors/test_businessinsider1.html b/tests/data/extractors/test_businessinsider1.html
deleted file mode 100644
index 18603a35..00000000
--- a/tests/data/extractors/test_businessinsider1.html
+++ /dev/null
@@ -1,2211 +0,0 @@
-MEANWHILE: Developments In Greece...
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/tests/data/extractors/test_businessinsider1.json b/tests/data/extractors/test_businessinsider1.json deleted file mode 100644 index a12c5838..00000000 --- a/tests/data/extractors/test_businessinsider1.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "url": "http://articles.businessinsider.com/2011-09-21/markets/30183619_1_parliament-vote-greece-civil-servants", - "expected": { - "meta_description": "More moves to survive.", - "domain": "articles.businessinsider.com", - "final_url": "http://articles.businessinsider.com/2011-09-21/markets/30183619_1_parliament-vote-greece-civil-servants", - "meta_keywords": "Economy, Greece, Austerity, Economic Crisis, Eurozone, Euro, Europe, Linette Lopez", - "cleaned_text": "As everyone in the world was transfixed on the Fed", - "meta_favicon": "http://static7.businessinsider.com/assets/images/faviconBI.ico", - "meta_lang": "en" - } -} \ No newline at end of file diff --git a/tests/data/extractors/test_businessinsider2.html b/tests/data/extractors/test_businessinsider2.html deleted file mode 100644 index 56573300..00000000 --- a/tests/data/extractors/test_businessinsider2.html +++ /dev/null @@ -1,2278 +0,0 @@ - - - - - GOLDMAN: 4 Key Points On The FOMC Announcement - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/tests/data/extractors/test_businessinsider2.json b/tests/data/extractors/test_businessinsider2.json deleted file mode 100644 index 0329e87e..00000000 --- a/tests/data/extractors/test_businessinsider2.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "url": "http://www.businessinsider.com/goldman-on-the-fed-announcement-2011-9", - "expected": { - "meta_description": "Here it is.", - "domain": "www.businessinsider.com", - "final_url": "http://www.businessinsider.com/goldman-on-the-fed-announcement-2011-9", - "meta_keywords": "Federal Reserve, Joe Weisenthal", - "cleaned_text": "From Goldman on the FOMC operation twist announcement", - "meta_favicon": "http://static7.businessinsider.com/assets/images/faviconBI.ico", - "meta_lang": "en" - } -} \ No newline at end of file diff --git a/tests/extractors.py b/tests/extractors.py index dccae5b2..9969c059 100644 --- a/tests/extractors.py +++ b/tests/extractors.py @@ -285,16 +285,6 @@ def test_politico(self): fields = ['cleaned_text'] self.runArticleAssertions(article=article, fields=fields) - def test_businessinsider1(self): - article = self.getArticle() - fields = ['cleaned_text'] - self.runArticleAssertions(article=article, fields=fields) - - def test_businessinsider2(self): - article = self.getArticle() - fields = ['cleaned_text'] - self.runArticleAssertions(article=article, fields=fields) - def test_businessinsider3(self): article = self.getArticle() fields = ['cleaned_text'] From b5ddaf132bb20913389f5f96e8da2c238c28a3a3 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Tue, 30 Dec 2014 07:02:42 +0100 Subject: [PATCH 050/100] #115 - add issue 115 test files --- tests/data/extractors/test_issue115.html | 1740 ++++++++++++++++++++++ tests/data/extractors/test_issue115.json | 6 + tests/extractors.py | 6 + 3 files changed, 1752 insertions(+) create mode 100644 tests/data/extractors/test_issue115.html create mode 100644 tests/data/extractors/test_issue115.json diff --git a/tests/data/extractors/test_issue115.html b/tests/data/extractors/test_issue115.html new file mode 100644 index 00000000..0b968cfc --- /dev/null +++ b/tests/data/extractors/test_issue115.html @@ -0,0 +1,1740 @@ + + + + + + + + + + + + + + Jessica Livingston: Why Startups Need to Focus on Sales, Not Marketing - The Accelerators - WSJ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
1:26 pm ET
Jun 3, 2014

Sales/Marketing

Jessica Livingston: Why Startups Need to Focus on Sales, Not Marketing
    JESSICA LIVINGSTON: The most important thing an early-stage startup should know about marketing is rather counterintuitive: that you probably shouldn’t be doing anything you’d use the term “marketing” to describe. Sales and marketing are two ends of a continuum. At the sales end your outreach is narrow and deep. At the marketing end it is broad and shallow. And for an early stage startup, narrow and deep is what you want — not just in the way you appeal to users, but in the type of product you build. Which means the kind of marketing you should be doing should be indistinguishable from sales: you should be talking to a small number of users who are seriously interested in what you’re making, not a broad audience who are on the whole indifferent.

    +

    Successful startups almost always start narrow and deep. Apple started with a computer Steve Wozniak made to impress his friends at the Homebrew Computer Club. There weren’t a lot of them, but they were really interested. Facebook started out just for Harvard University students. Again, not a lot of potential users, but they really wanted it. Successful startups start narrow and deep partly because they don’t have the power to reach a big audience, so they have to choose a very interested one. But also because the product is still being defined. The conversation with initial users is also market research.

    + + + + + + +
    +
    +
    +
    +
    +
    +

    See what other startup mentors have to say about marketing tactics.

    +
    +

    At Y Combinator, we advise most startups to begin by seeking out some core group of early adopters and then engaging with individual users to convince them to sign up.

    +

    For example, the early adopters of Airbnb were hosts and guests in New York City (Y Combinator funded Airbnb in Winter of 2009). To grow, Airbnb needed to get more hosts and also help existing hosts convert better. So Brian Chesky and Joe Gebbia flew to New York every week to meet with hosts — teaching them how to price their listings, take better photos, and so on. They also asked hosts for introductions to potential new hosts, who they then met in person.

    +

    Stripe (YC S09) was particularly aggressive about signing up users manually at first. The YC alumni network are a good source of early adopters for a service like Stripe. Co-founders Patrick and John Collison worked their way methodically through it, and when someone agreed to try Stripe, the brothers would install it for them on the spot rather than email a link. We now call their technique “Collison installation.”

    +

    Many guest speakers at Y Combinator offer stories about how manual the initial process of getting users was. Pinterest is a mass consumer product, but Ben Silbermann said even he began by recruiting users manually. Ben would literally walk into cafes in Palo Alto and ask random people to try out Pinterest while he gathered feedback over their shoulders.

    +

    The danger of the term “marketing” is that it implies the opposite end of the sales/marketing spectrum from the one startups should be focusing on. And just as focusing on the right end has a double benefit — you acquire users and define the product — focusing on the wrong end is doubly dangerous, because you not only fail to grow, but you can remain in denial about your product’s lameness.

    +

    All too often, I’ve seen founders build some initially mediocre product, announce it to the world, find that users never show up, and not know what to do next. As well as not getting any users, the startup never gets the feedback it needs to improve the product.

    +

    So why wouldn’t all founders start by engaging with users individually? Because it’s hard and demoralizing. Sales gives you a kind of harsh feedback that “marketing” doesn’t. You try to convince someone to use what you’ve built, and they won’t. These conversations are painful, but necessary. I suspect from my experience that founders who want to remain in denial about the inadequacy of their product and/or the difficulty of starting a startup subconsciously prefer the broad and shallow “marketing” approach precisely because they can’t face the work and unpleasant truths they’ll find if they talk to users.

    +

    How should you measure if your manual efforts are effective? Focus on growth rate rather than absolute numbers. Then you won’t be dismayed if the absolute numbers are small at first. If you have 20 users, you only need two more this week to grow 10%. And while two users is a small number for most products, 10% a week is a great growth rate. If you keep growing at 10% a week, the absolute numbers will eventually become impressive.

    +

    Our advice at Y Combinator is always to make a really good product and go out and get users manually. The two work hand-in-hand: you need to talk individually to early adopters to make a really good product. So focusing on the narrow and deep end of the sales/marketing continuum is not just the most effective way to get users. Your startup will die if you don’t.

    +


    +

    +


    +

    + +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/data/extractors/test_issue115.json b/tests/data/extractors/test_issue115.json new file mode 100644 index 00000000..8f87a9bc --- /dev/null +++ b/tests/data/extractors/test_issue115.json @@ -0,0 +1,6 @@ +{ + "url": "http://blogs.wsj.com/accelerators/2014/06/03/jessica-livingston-why-startups-need-to-focus-on-sales-not-marketing/", + "expected": { + "cleaned_text": "JESSICA LIVINGSTON: The most important thing an early-stage startup should know about marketing is rather counterintuitive: that you probably shouldn’t be doing anything you’d use the term" + } +} diff --git a/tests/extractors.py b/tests/extractors.py index 9969c059..07b9362e 100644 --- a/tests/extractors.py +++ b/tests/extractors.py @@ -365,6 +365,12 @@ def test_issue129(self): fields = ['cleaned_text'] self.runArticleAssertions(article=article, fields=fields) + def test_issue115(self): + # https://github.com/grangier/python-goose/issues/115 + article = self.getArticle() + fields = ['cleaned_text'] + self.runArticleAssertions(article=article, fields=fields) + class TestPublishDate(TestExtractionBase): From c7ec678ba411a598c0d3d0970fdbe717cae2d351 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Tue, 30 Dec 2014 07:09:02 +0100 Subject: [PATCH 051/100] #115 - use known content tags to be article main body --- goose/crawler.py | 9 +++------ goose/extractors.py | 27 +++++++++++++++++++-------- 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/goose/crawler.py b/goose/crawler.py index 43aaf4ea..6b205916 100644 --- a/goose/crawler.py +++ b/goose/crawler.py @@ -108,10 +108,10 @@ def crawl(self, crawl_candidate): self.article.authors = self.extractor.extract_authors() self.article.title = self.extractor.get_title() - # check for an articleBody - # if we find one force the article.doc to be the articleBody node + # check for known node as content body + # if we find one force the article.doc to be the found node # this will prevent the cleaner to remove unwanted text content - article_body = self.extractor.get_articlebody() + article_body = self.extractor.get_known_article_tags() if article_body is not None: self.article.doc = article_body @@ -141,9 +141,6 @@ def crawl(self, crawl_candidate): # post cleanup self.article.top_node = self.extractor.post_cleanup() - # article links - self.article.links = self.extractor.extract_links() - # clean_text self.article.cleaned_text = self.formatter.get_formatted_text() diff --git a/goose/extractors.py b/goose/extractors.py index 07f5321f..3376df7d 100644 --- a/goose/extractors.py +++ b/goose/extractors.py @@ -40,6 +40,7 @@ A_REL_TAG_SELECTOR = "a[rel=tag]" A_HREF_TAG_SELECTOR = "a[href*='/tag/'], a[href*='/tags/'], a[href*='/topic/'], a[href*='?keyword=']" RE_LANG = r'^[A-Za-z]{2}$' + KNOWN_PUBLISH_DATE_TAGS = [ {'attribute': 'property', 'value': 'rnews:datePublished', 'content': 'content'}, {'attribute': 'property', 'value': 'article:published_time', 'content': 'content'}, @@ -47,6 +48,11 @@ {'attribute': 'itemprop', 'value': 'datePublished', 'content': 'datetime'}, ] +KNOWN_ARTICLE_CONTENT_TAGS = [ + {'attr': 'itemprop', 'value': 'articleBody'}, + {'attr': 'class', 'value': 'post-content'}, +] + class ContentExtractor(object): @@ -249,6 +255,16 @@ def get_domain(self): return o.hostname return None + def get_known_article_tags(self): + for item in KNOWN_ARTICLE_CONTENT_TAGS: + nodes = self.parser.getElementsByTag( + self.article.doc, + attr=item['attr'], + value=item['value']) + 
if len(nodes):
+                return nodes[0]
+        return None
+
     def get_articlebody(self):
         article_body = self.parser.getElementsByTag(
             self.article.doc,
@@ -261,8 +277,9 @@ def get_articlebody(self):
         return None
 
     def is_articlebody(self, node):
-        if self.parser.getAttribute(node, 'itemprop') == 'articleBody':
-            return True
+        for item in KNOWN_ARTICLE_CONTENT_TAGS:
+            if self.parser.getAttribute(node, item['attr']) == item['value']:
+                return True
         return False
 
     def extract_opengraph(self):
@@ -594,12 +611,6 @@ def nodes_to_check(self, doc):
         """
         nodes_to_check = []
 
-        # microdata
-        # set the most score to articleBody node
-        article_body_node = self.get_articlebody()
-        if article_body_node is not None:
-            self.update_score(article_body_node, 99)
-
         for tag in ['p', 'pre', 'td']:
            items = self.parser.getElementsByTag(doc, tag=tag)
            nodes_to_check += items
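To see what the new lookup does, here is a minimal reduction of
get_known_article_tags() using plain lxml instead of the goose parser
wrapper; the sample markup and function name are invented for illustration.

    from lxml import etree

    KNOWN_ARTICLE_CONTENT_TAGS = [
        {'attr': 'itemprop', 'value': 'articleBody'},
        {'attr': 'class', 'value': 'post-content'},
    ]

    def find_known_article_node(doc):
        # the first descriptor that matches wins, so list order sets priority
        for item in KNOWN_ARTICLE_CONTENT_TAGS:
            nodes = doc.xpath("//*[@%s='%s']" % (item['attr'], item['value']))
            if len(nodes):
                return nodes[0]
        return None

    doc = etree.HTML("<div itemprop='articleBody'>body text</div>")
    print(find_known_article_node(doc).text)  # -> body text

Because the scan stops at the first hit, itemprop="articleBody" takes
precedence over class="post-content" when a page carries both.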
From 0e6201d1acf08cd8cc392bbfcc545d95a51b45e8 Mon Sep 17 00:00:00 2001
From: Xavier Grangier
Date: Tue, 30 Dec 2014 07:30:47 +0100
Subject: [PATCH 052/100] #81 - use correct language for stopwords file

---
 goose/extractors.py | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/goose/extractors.py b/goose/extractors.py
index 3376df7d..e4efcbae 100644
--- a/goose/extractors.py
+++ b/goose/extractors.py
@@ -66,12 +66,21 @@ def __init__(self, config, article):
         # article
         self.article = article
 
-        # language
-        self.language = config.target_language
-
         # stopwords class
         self.stopwords_class = config.stopwords_class
 
+    def get_language(self):
+        """\
+        Returns the language used by the article or
+        the configuration language
+        """
+        # we don't want to force the target language
+        # so we use the article.meta_lang
+        if self.config.use_meta_language:
+            if self.article.meta_lang:
+                return self.article.meta_lang[:2]
+        return self.config.target_language
+
     def clean_title(self, title):
         """Clean title with the use of og:site_name
         in this case try to get rid of the site name
@@ -371,7 +380,7 @@ def calculate_best_node(self):
         for node in nodes_to_check:
             text_node = self.parser.getText(node)
-            word_stats = self.stopwords_class(language=self.language).get_stopword_count(text_node)
+            word_stats = self.stopwords_class(language=self.get_language()).get_stopword_count(text_node)
             high_link_density = self.is_highlink_density(node)
             if word_stats.get_stopword_count() > 2 and not high_link_density:
                 nodes_with_text.append(node)
@@ -397,7 +406,7 @@ def calculate_best_node(self):
             boost_score = float(5)
             text_node = self.parser.getText(node)
-            word_stats = self.stopwords_class(language=self.language).get_stopword_count(text_node)
+            word_stats = self.stopwords_class(language=self.get_language()).get_stopword_count(text_node)
             upscore = int(word_stats.get_stopword_count() + boost_score)
 
             # parent node
@@ -453,7 +462,7 @@ def is_boostable(self, node):
                 if steps_away >= max_stepsaway_from_node:
                     return False
                 paraText = self.parser.getText(current_node)
-                word_stats = self.stopwords_class(language=self.language).get_stopword_count(paraText)
+                word_stats = self.stopwords_class(language=self.get_language()).get_stopword_count(paraText)
                 if word_stats.get_stopword_count() > minimum_stopword_count:
                     return True
                 steps_away += 1
@@ -500,7 +509,7 @@ def get_siblings_content(self, current_sibling, baselinescore_siblings_para):
             for first_paragraph in potential_paragraphs:
                 text = self.parser.getText(first_paragraph)
                 if len(text) > 0:
-                    word_stats = self.stopwords_class(language=self.language).get_stopword_count(text)
+                    word_stats = self.stopwords_class(language=self.get_language()).get_stopword_count(text)
                     paragraph_score = word_stats.get_stopword_count()
                     sibling_baseline_score = float(.30)
                     high_link_density = self.is_highlink_density(first_paragraph)
@@ -527,7 +536,7 @@ def get_siblings_score(self, top_node):
         for node in nodes_to_check:
             text_node = self.parser.getText(node)
-            word_stats = self.stopwords_class(language=self.language).get_stopword_count(text_node)
+            word_stats = self.stopwords_class(language=self.get_language()).get_stopword_count(text_node)
             high_link_density = self.is_highlink_density(node)
             if word_stats.get_stopword_count() > 2 and not high_link_density:
                 paragraphs_number += 1
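The fallback order of the new get_language() helper is easier to read in
isolation. A sketch with stand-in config and article objects, not the
patched extractor:

    from collections import namedtuple

    Config = namedtuple('Config', 'use_meta_language target_language')
    Article = namedtuple('Article', 'meta_lang')

    def get_language(config, article):
        # prefer the language the page declares about itself
        if config.use_meta_language and article.meta_lang:
            return article.meta_lang[:2]
        # otherwise fall back to the configured target language
        return config.target_language

    print(get_language(Config(True, 'en'), Article('fr-FR')))  # -> fr
    print(get_language(Config(True, 'en'), Article(None)))     # -> en

So a page declaring "fr-FR" now selects the French stopword list, while
pages without a usable meta language keep the configured default.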
From 22ded4b3126e878f7608bd4a5b72c9a4c8daf1fe Mon Sep 17 00:00:00 2001
From: Xavier Grangier
Date: Tue, 30 Dec 2014 08:04:47 +0100
Subject: [PATCH 055/100] #183 - use article tag for a top node

---
 goose/extractors.py                           | 26 ++++++++-----------
 tests/data/extractors/test_articlebody.json   |  6 -----
 .../test_articlebody_attribute.html           | 15 +++++++++++
 .../test_articlebody_attribute.json           |  6 +++++
 ...dy.html => test_articlebody_itemprop.html} |  0
 .../extractors/test_articlebody_itemprop.json |  6 +++++
 .../data/extractors/test_articlebody_tag.html | 15 +++++++++++
 .../data/extractors/test_articlebody_tag.json |  6 +++++
 tests/extractors.py                           | 23 ++++++++++++----
 9 files changed, 77 insertions(+), 26 deletions(-)
 delete mode 100644 tests/data/extractors/test_articlebody.json
 create mode 100644 tests/data/extractors/test_articlebody_attribute.html
 create mode 100644 tests/data/extractors/test_articlebody_attribute.json
 rename tests/data/extractors/{test_articlebody.html => test_articlebody_itemprop.html} (100%)
 create mode 100644 tests/data/extractors/test_articlebody_itemprop.json
 create mode 100644 tests/data/extractors/test_articlebody_tag.html
 create mode 100644 tests/data/extractors/test_articlebody_tag.json

diff --git a/goose/extractors.py b/goose/extractors.py
index e4efcbae..9440aea6 100644
--- a/goose/extractors.py
+++ b/goose/extractors.py
@@ -51,6 +51,7 @@
 KNOWN_ARTICLE_CONTENT_TAGS = [
     {'attr': 'itemprop', 'value': 'articleBody'},
     {'attr': 'class', 'value': 'post-content'},
+    {'tag': 'article'},
 ]
 
 
@@ -268,27 +269,22 @@ def get_known_article_tags(self):
         for item in KNOWN_ARTICLE_CONTENT_TAGS:
             nodes = self.parser.getElementsByTag(
                 self.article.doc,
-                attr=item['attr'],
-                value=item['value'])
+                **item)
             if len(nodes):
                 return nodes[0]
         return None
 
-    def get_articlebody(self):
-        article_body = self.parser.getElementsByTag(
-            self.article.doc,
-            attr='itemprop',
-            value='articleBody')
-        if len(article_body):
-            article_body = article_body[0]
-            self.parser.setAttribute(article_body, "extraction", "microDataExtration")
-            return article_body
-        return None
-
     def is_articlebody(self, node):
         for item in KNOWN_ARTICLE_CONTENT_TAGS:
-            if self.parser.getAttribute(node, item['attr']) == item['value']:
-                return True
+            # attribute
+            if "attr" in item and "value" in item:
+                if self.parser.getAttribute(node, item['attr']) == item['value']:
+                    return True
+            # tag
+            if "tag" in item:
+                if node.tag == item['tag']:
+                    return True
+
         return False
 
     def extract_opengraph(self):
diff --git a/tests/data/extractors/test_articlebody.json b/tests/data/extractors/test_articlebody.json
deleted file mode 100644
index a775091d..00000000
--- a/tests/data/extractors/test_articlebody.json
+++ /dev/null
@@ -1,6 +0,0 @@
-{
-    "url": "http://exemple.com/test_opengraphcontent",
-    "expected": {
-        "cleaned_text": "Search-and-rescue teams were mobilized from across Southeast Asia on Sunday after a commercial airliner with 162 people on board lost contact with ground controllers off the coast of Borneo, a search effort that evoked a distressingly familiar mix of grief and mystery nine months after a Malaysia Airlines jetliner disappeared over the Indian Ocean."
-    }
-}
diff --git a/tests/data/extractors/test_articlebody_attribute.html b/tests/data/extractors/test_articlebody_attribute.html
new file mode 100644
index 00000000..bbf00f65
--- /dev/null
+++ b/tests/data/extractors/test_articlebody_attribute.html
@@ -0,0 +1,15 @@
+
+
    +

    + Not an Actual Content
    + TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies.

    +
    +
    +

    + Search-and-rescue teams were mobilized from across Southeast Asia on Sunday after a commercial airliner with 162 people on board lost contact with ground controllers off the coast of Borneo, a search effort that evoked a distressingly familiar mix of grief and mystery nine months after a Malaysia Airlines jetliner disappeared over the Indian Ocean. +

    +
    +
+
diff --git a/tests/data/extractors/test_articlebody_attribute.json b/tests/data/extractors/test_articlebody_attribute.json
new file mode 100644
index 00000000..7fbebcaf
--- /dev/null
+++ b/tests/data/extractors/test_articlebody_attribute.json
@@ -0,0 +1,6 @@
+{
+    "url": "http://exemple.com/test_opengraphcontent",
+    "expected": {
+        "cleaned_text": "Search-and-rescue teams were mobilized "
+    }
+}
diff --git a/tests/data/extractors/test_articlebody.html b/tests/data/extractors/test_articlebody_itemprop.html
similarity index 100%
rename from tests/data/extractors/test_articlebody.html
rename to tests/data/extractors/test_articlebody_itemprop.html
diff --git a/tests/data/extractors/test_articlebody_itemprop.json b/tests/data/extractors/test_articlebody_itemprop.json
new file mode 100644
index 00000000..7fbebcaf
--- /dev/null
+++ b/tests/data/extractors/test_articlebody_itemprop.json
@@ -0,0 +1,6 @@
+{
+    "url": "http://exemple.com/test_opengraphcontent",
+    "expected": {
+        "cleaned_text": "Search-and-rescue teams were mobilized "
+    }
+}
diff --git a/tests/data/extractors/test_articlebody_tag.html b/tests/data/extractors/test_articlebody_tag.html
new file mode 100644
index 00000000..6e7ca4be
--- /dev/null
+++ b/tests/data/extractors/test_articlebody_tag.html
@@ -0,0 +1,15 @@
+
+
    +

    + Not an Actual Content
    + TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies.

    +
    +
    +

    + Search-and-rescue teams were mobilized from across Southeast Asia on Sunday after a commercial airliner with 162 people on board lost contact with ground controllers off the coast of Borneo, a search effort that evoked a distressingly familiar mix of grief and mystery nine months after a Malaysia Airlines jetliner disappeared over the Indian Ocean. +

    +
    +
+
diff --git a/tests/data/extractors/test_articlebody_tag.json b/tests/data/extractors/test_articlebody_tag.json
new file mode 100644
index 00000000..7fbebcaf
--- /dev/null
+++ b/tests/data/extractors/test_articlebody_tag.json
@@ -0,0 +1,6 @@
+{
+    "url": "http://exemple.com/test_opengraphcontent",
+    "expected": {
+        "cleaned_text": "Search-and-rescue teams were mobilized "
+    }
+}
diff --git a/tests/extractors.py b/tests/extractors.py
index 07b9362e..b9496b8c 100644
--- a/tests/extractors.py
+++ b/tests/extractors.py
@@ -345,11 +345,6 @@ def test_okaymarketing(self):
         fields = ['cleaned_text']
         self.runArticleAssertions(article=article, fields=fields)
 
-    def test_articlebody(self):
-        article = self.getArticle()
-        fields = ['cleaned_text']
-        self.runArticleAssertions(article=article, fields=fields)
-
     def test_opengraph(self):
         article = self.getArticle()
         fields = ['opengraph']
@@ -372,6 +367,24 @@ def test_issue115(self):
         self.runArticleAssertions(article=article, fields=fields)
 
 
+class TestArticleTopNode(TestExtractionBase):
+
+    def test_articlebody_itemprop(self):
+        article = self.getArticle()
+        fields = ['cleaned_text']
+        self.runArticleAssertions(article=article, fields=fields)
+
+    def test_articlebody_attribute(self):
+        article = self.getArticle()
+        fields = ['cleaned_text']
+        self.runArticleAssertions(article=article, fields=fields)
+
+    def test_articlebody_tag(self):
+        article = self.getArticle()
+        fields = ['cleaned_text']
+        self.runArticleAssertions(article=article, fields=fields)
+
+
 class TestPublishDate(TestExtractionBase):
 
     def test_publish_date(self):
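The new {'tag': 'article'} entry means a rule can now match on tag name alone, not only on an attribute/value pair. The dispatch is easy to follow in isolation; a sketch using plain dicts in place of lxml nodes (matches() and its arguments are illustrative, not goose API):

    KNOWN_ARTICLE_CONTENT_TAGS = [
        {'attr': 'itemprop', 'value': 'articleBody'},
        {'attr': 'class', 'value': 'post-content'},
        {'tag': 'article'},
    ]

    def matches(node_tag, node_attrs):
        # a node is an article body if any known rule matches it
        for item in KNOWN_ARTICLE_CONTENT_TAGS:
            if 'attr' in item and node_attrs.get(item['attr']) == item['value']:
                return True
            if 'tag' in item and node_tag == item['tag']:
                return True
        return False

    assert matches('article', {})
    assert matches('div', {'class': 'post-content'})
    assert not matches('div', {})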
""" -version_info = (1, 0, 22) +version_info = (1, 0, 23) __version__ = ".".join(map(str, version_info)) From dd33aab172138266c90940ee9bdcef1712996a22 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Tue, 30 Dec 2014 08:42:06 +0100 Subject: [PATCH 058/100] ignore egg files --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index bea68953..4bfadf57 100644 --- a/.gitignore +++ b/.gitignore @@ -8,5 +8,6 @@ env/ *~ .idea ._* +*.egg venv/ goose_extractor.egg-info/ From 3ebc97cffa40298bcad87e9113b42cb0b1704940 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Tue, 30 Dec 2014 09:04:51 +0100 Subject: [PATCH 059/100] #187 - empty list --- goose/extractors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/goose/extractors.py b/goose/extractors.py index 9440aea6..10e32a21 100644 --- a/goose/extractors.py +++ b/goose/extractors.py @@ -36,7 +36,7 @@ ARROWS_SPLITTER = StringSplitter("»") COLON_SPLITTER = StringSplitter(":") SPACE_SPLITTER = StringSplitter(' ') -NO_STRINGS = set() +NO_STRINGS = [] A_REL_TAG_SELECTOR = "a[rel=tag]" A_HREF_TAG_SELECTOR = "a[href*='/tag/'], a[href*='/tags/'], a[href*='/topic/'], a[href*='?keyword=']" RE_LANG = r'^[A-Za-z]{2}$' From 8eccabf1338fd653cad6f2934db86ceed4492910 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Wed, 31 Dec 2014 02:07:05 +0100 Subject: [PATCH 060/100] #188 - mv article extractor to extractors directory --- goose/{extractors.py => extractors/content.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename goose/{extractors.py => extractors/content.py} (100%) diff --git a/goose/extractors.py b/goose/extractors/content.py similarity index 100% rename from goose/extractors.py rename to goose/extractors/content.py From 731f104dbcd80b38176e7a073be90a59f10240a3 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Wed, 31 Dec 2014 02:10:47 +0100 Subject: [PATCH 061/100] #188 - create a base extractor class --- goose/extractors/__init__.py | 38 ++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 goose/extractors/__init__.py diff --git a/goose/extractors/__init__.py b/goose/extractors/__init__.py new file mode 100644 index 00000000..5554efbc --- /dev/null +++ b/goose/extractors/__init__.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +"""\ +This is a python port of "Goose" orignialy licensed to Gravity.com +under one or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. + +Python port was written by Xavier Grangier for Recrutae + +Gravity.com licenses this file +to you under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance +with the License. You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + + +class BaseExtractor(object): + + def __init__(self, config, article): + # config + self.config = config + + # parser + self.parser = self.config.get_parser() + + # article + self.article = article + + # stopwords class + self.stopwords_class = config.stopwords_class From 6ef3f68e29a78e5f4718d6191ef58bf962e600a3 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Wed, 31 Dec 2014 02:12:02 +0100 Subject: [PATCH 062/100] #188 - contentextractor inherits form baseextractor --- goose/extractors/content.py | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/goose/extractors/content.py b/goose/extractors/content.py index 10e32a21..0053f3f1 100644 --- a/goose/extractors/content.py +++ b/goose/extractors/content.py @@ -23,6 +23,7 @@ import re from copy import deepcopy from urlparse import urlparse, urljoin +from goose.extractors import BaseExtractor from goose.utils import StringSplitter from goose.utils import StringReplacement from goose.utils import ReplaceSequence @@ -55,20 +56,7 @@ ] -class ContentExtractor(object): - - def __init__(self, config, article): - # config - self.config = config - - # parser - self.parser = self.config.get_parser() - - # article - self.article = article - - # stopwords class - self.stopwords_class = config.stopwords_class +class ContentExtractor(BaseExtractor): def get_language(self): """\ From bcf4654867b4915fe926613406208e78db8fb443 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Wed, 31 Dec 2014 02:14:18 +0100 Subject: [PATCH 063/100] #188 - create specific extractors classes --- goose/extractors/meta.py | 28 ++++++++++++++++++++++++++++ goose/extractors/opengraph.py | 28 ++++++++++++++++++++++++++++ goose/extractors/publishdate.py | 28 ++++++++++++++++++++++++++++ goose/extractors/title.py | 28 ++++++++++++++++++++++++++++ 4 files changed, 112 insertions(+) create mode 100644 goose/extractors/meta.py create mode 100644 goose/extractors/opengraph.py create mode 100644 goose/extractors/publishdate.py create mode 100644 goose/extractors/title.py diff --git a/goose/extractors/meta.py b/goose/extractors/meta.py new file mode 100644 index 00000000..7a92df21 --- /dev/null +++ b/goose/extractors/meta.py @@ -0,0 +1,28 @@ +# -*- coding: utf-8 -*- +"""\ +This is a python port of "Goose" orignialy licensed to Gravity.com +under one or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. + +Python port was written by Xavier Grangier for Recrutae + +Gravity.com licenses this file +to you under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance +with the License. You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from goose.extractors import BaseExtractor + + +class ContentMetaExtractor(BaseExtractor): + pass diff --git a/goose/extractors/opengraph.py b/goose/extractors/opengraph.py new file mode 100644 index 00000000..ee916b82 --- /dev/null +++ b/goose/extractors/opengraph.py @@ -0,0 +1,28 @@ +# -*- coding: utf-8 -*- +"""\ +This is a python port of "Goose" orignialy licensed to Gravity.com +under one or more contributor license agreements. 
From 6ef3f68e29a78e5f4718d6191ef58bf962e600a3 Mon Sep 17 00:00:00 2001
From: Xavier Grangier
Date: Wed, 31 Dec 2014 02:12:02 +0100
Subject: [PATCH 062/100] #188 - contentextractor inherits from baseextractor

---
 goose/extractors/content.py | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/goose/extractors/content.py b/goose/extractors/content.py
index 10e32a21..0053f3f1 100644
--- a/goose/extractors/content.py
+++ b/goose/extractors/content.py
@@ -23,6 +23,7 @@
 import re
 from copy import deepcopy
 from urlparse import urlparse, urljoin
+from goose.extractors import BaseExtractor
 from goose.utils import StringSplitter
 from goose.utils import StringReplacement
 from goose.utils import ReplaceSequence
@@ -55,20 +56,7 @@
 ]
 
 
-class ContentExtractor(object):
-
-    def __init__(self, config, article):
-        # config
-        self.config = config
-
-        # parser
-        self.parser = self.config.get_parser()
-
-        # article
-        self.article = article
-
-        # stopwords class
-        self.stopwords_class = config.stopwords_class
+class ContentExtractor(BaseExtractor):
 
     def get_language(self):
         """\

From bcf4654867b4915fe926613406208e78db8fb443 Mon Sep 17 00:00:00 2001
From: Xavier Grangier
Date: Wed, 31 Dec 2014 02:14:18 +0100
Subject: [PATCH 063/100] #188 - create specific extractor classes

---
 goose/extractors/meta.py        | 28 ++++++++++++++++++++++++++++
 goose/extractors/opengraph.py   | 28 ++++++++++++++++++++++++++++
 goose/extractors/publishdate.py | 28 ++++++++++++++++++++++++++++
 goose/extractors/title.py       | 28 ++++++++++++++++++++++++++++
 4 files changed, 112 insertions(+)
 create mode 100644 goose/extractors/meta.py
 create mode 100644 goose/extractors/opengraph.py
 create mode 100644 goose/extractors/publishdate.py
 create mode 100644 goose/extractors/title.py

diff --git a/goose/extractors/meta.py b/goose/extractors/meta.py
new file mode 100644
index 00000000..7a92df21
--- /dev/null
+++ b/goose/extractors/meta.py
@@ -0,0 +1,28 @@
+# -*- coding: utf-8 -*-
+"""\
+This is a python port of "Goose" originally licensed to Gravity.com
+under one or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.
+
+Python port was written by Xavier Grangier for Recrutae
+
+Gravity.com licenses this file
+to you under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from goose.extractors import BaseExtractor
+
+
+class ContentMetaExtractor(BaseExtractor):
+    pass
diff --git a/goose/extractors/opengraph.py b/goose/extractors/opengraph.py
new file mode 100644
index 00000000..ee916b82
--- /dev/null
+++ b/goose/extractors/opengraph.py
@@ -0,0 +1,28 @@
+# -*- coding: utf-8 -*-
+"""\
+This is a python port of "Goose" originally licensed to Gravity.com
+under one or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.
+
+Python port was written by Xavier Grangier for Recrutae
+
+Gravity.com licenses this file
+to you under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from goose.extractors import BaseExtractor
+
+
+class ContentOpenGraphExtractor(BaseExtractor):
+    pass
diff --git a/goose/extractors/publishdate.py b/goose/extractors/publishdate.py
new file mode 100644
index 00000000..7ea1635a
--- /dev/null
+++ b/goose/extractors/publishdate.py
@@ -0,0 +1,28 @@
+# -*- coding: utf-8 -*-
+"""\
+This is a python port of "Goose" originally licensed to Gravity.com
+under one or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.
+
+Python port was written by Xavier Grangier for Recrutae
+
+Gravity.com licenses this file
+to you under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from goose.extractors import BaseExtractor
+
+
+class ContentPublishDateExtractor(BaseExtractor):
+    pass
diff --git a/goose/extractors/title.py b/goose/extractors/title.py
new file mode 100644
index 00000000..1afdb37e
--- /dev/null
+++ b/goose/extractors/title.py
@@ -0,0 +1,28 @@
+# -*- coding: utf-8 -*-
+"""\
+This is a python port of "Goose" originally licensed to Gravity.com
+under one or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.
+
+Python port was written by Xavier Grangier for Recrutae
+
+Gravity.com licenses this file
+to you under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from goose.extractors import BaseExtractor
+
+
+class ContentTitleExtractor(BaseExtractor):
+    pass
+""" + +from goose.extractors import BaseExtractor + + +class ContentTitleExtractor(BaseExtractor): + pass From cbbfba3c3544439675fac8b2d0de49b23530cf01 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Wed, 31 Dec 2014 02:17:59 +0100 Subject: [PATCH 064/100] #188 - add tags and author extractors --- goose/extractors/author.py | 28 ++++++++++++++++++++++++++++ goose/extractors/tags.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 goose/extractors/author.py create mode 100644 goose/extractors/tags.py diff --git a/goose/extractors/author.py b/goose/extractors/author.py new file mode 100644 index 00000000..bc18925a --- /dev/null +++ b/goose/extractors/author.py @@ -0,0 +1,28 @@ +# -*- coding: utf-8 -*- +"""\ +This is a python port of "Goose" orignialy licensed to Gravity.com +under one or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. + +Python port was written by Xavier Grangier for Recrutae + +Gravity.com licenses this file +to you under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance +with the License. You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from goose.extractors import BaseExtractor + + +class ContentAuthorExtractor(BaseExtractor): + pass diff --git a/goose/extractors/tags.py b/goose/extractors/tags.py new file mode 100644 index 00000000..28f835ef --- /dev/null +++ b/goose/extractors/tags.py @@ -0,0 +1,28 @@ +# -*- coding: utf-8 -*- +"""\ +This is a python port of "Goose" orignialy licensed to Gravity.com +under one or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. + +Python port was written by Xavier Grangier for Recrutae + +Gravity.com licenses this file +to you under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance +with the License. You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +from goose.extractors import BaseExtractor + + +class ContentTagsExtractor(BaseExtractor): + pass From a957931ce9df9daf65007a5314f4e42521e733eb Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Wed, 31 Dec 2014 02:20:28 +0100 Subject: [PATCH 065/100] #188 - correct import --- goose/crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/goose/crawler.py b/goose/crawler.py index 0c0eaa8f..cc25c5e9 100644 --- a/goose/crawler.py +++ b/goose/crawler.py @@ -25,7 +25,7 @@ from copy import deepcopy from goose.article import Article from goose.utils import URLHelper, RawHelper -from goose.extractors import StandardContentExtractor +from goose.extractors.content import StandardContentExtractor from goose.cleaners import StandardDocumentCleaner from goose.outputformatters import StandardOutputFormatter from goose.images.extractors import UpgradedImageIExtractor From 8d6d49e94e8de50b692c10631e9d649948c944e8 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Wed, 31 Dec 2014 02:31:03 +0100 Subject: [PATCH 066/100] #188 - move video to extractor directory --- goose/crawler.py | 3 ++- .../{videos/extractors.py => extractors/videos.py} | 14 ++++---------- goose/{videos => }/videos.py | 0 goose/videos/__init__.py | 0 4 files changed, 6 insertions(+), 11 deletions(-) rename goose/{videos/extractors.py => extractors/videos.py} (95%) rename goose/{videos => }/videos.py (100%) delete mode 100644 goose/videos/__init__.py diff --git a/goose/crawler.py b/goose/crawler.py index cc25c5e9..e77f9218 100644 --- a/goose/crawler.py +++ b/goose/crawler.py @@ -26,10 +26,11 @@ from goose.article import Article from goose.utils import URLHelper, RawHelper from goose.extractors.content import StandardContentExtractor +from goose.extractors.videos import VideoExtractor from goose.cleaners import StandardDocumentCleaner from goose.outputformatters import StandardOutputFormatter from goose.images.extractors import UpgradedImageIExtractor -from goose.videos.extractors import VideoExtractor + from goose.network import HtmlFetcher diff --git a/goose/videos/extractors.py b/goose/extractors/videos.py similarity index 95% rename from goose/videos/extractors.py rename to goose/extractors/videos.py index 71c52895..569b5f15 100644 --- a/goose/videos/extractors.py +++ b/goose/extractors/videos.py @@ -21,25 +21,19 @@ limitations under the License. 
""" -from goose.videos.videos import Video +from goose.extractors import BaseExtractor +from goose.videos import Video VIDEOS_TAGS = ['iframe', 'embed', 'object', 'video'] VIDEO_PROVIDERS = ['youtube', 'vimeo', 'dailymotion', 'kewego'] -class VideoExtractor(object): +class VideoExtractor(BaseExtractor): """\ Extracts a list of video from Article top node """ def __init__(self, config, article): - # article - self.article = article - - # config - self.config = config - - # parser - self.parser = self.config.get_parser() + super(VideoExtractor, self).__init__(config, article) # candidates self.candidates = [] diff --git a/goose/videos/videos.py b/goose/videos.py similarity index 100% rename from goose/videos/videos.py rename to goose/videos.py diff --git a/goose/videos/__init__.py b/goose/videos/__init__.py deleted file mode 100644 index e69de29b..00000000 From ab81954745cf57e172c7b31d1bf4eeb314a34959 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Wed, 31 Dec 2014 02:41:22 +0100 Subject: [PATCH 067/100] #188 - move images extractor to extractors dir and correct videos --- goose/crawler.py | 2 +- goose/{images/extractors.py => extractors/images.py} | 4 ++-- goose/extractors/videos.py | 2 +- goose/{images => }/image.py | 0 goose/images/__init__.py | 0 goose/{images/utils.py => utils/images.py} | 4 ++-- goose/{videos.py => video.py} | 0 tests/images.py | 7 ++++--- 8 files changed, 10 insertions(+), 9 deletions(-) rename goose/{images/extractors.py => extractors/images.py} (99%) rename goose/{images => }/image.py (100%) delete mode 100644 goose/images/__init__.py rename goose/{images/utils.py => utils/images.py} (97%) rename goose/{videos.py => video.py} (100%) diff --git a/goose/crawler.py b/goose/crawler.py index e77f9218..4f360822 100644 --- a/goose/crawler.py +++ b/goose/crawler.py @@ -27,9 +27,9 @@ from goose.utils import URLHelper, RawHelper from goose.extractors.content import StandardContentExtractor from goose.extractors.videos import VideoExtractor +from goose.extractors.images import UpgradedImageIExtractor from goose.cleaners import StandardDocumentCleaner from goose.outputformatters import StandardOutputFormatter -from goose.images.extractors import UpgradedImageIExtractor from goose.network import HtmlFetcher diff --git a/goose/images/extractors.py b/goose/extractors/images.py similarity index 99% rename from goose/images/extractors.py rename to goose/extractors/images.py index 4372ae8c..1cf9af09 100644 --- a/goose/images/extractors.py +++ b/goose/extractors/images.py @@ -24,8 +24,8 @@ import os from urlparse import urlparse, urljoin from goose.utils import FileHelper -from goose.images.image import Image -from goose.images.utils import ImageUtils +from goose.image import Image +from goose.utils.images import ImageUtils KNOWN_IMG_DOM_NAMES = [ "yn-story-related-media", diff --git a/goose/extractors/videos.py b/goose/extractors/videos.py index 569b5f15..88fdf20d 100644 --- a/goose/extractors/videos.py +++ b/goose/extractors/videos.py @@ -22,7 +22,7 @@ """ from goose.extractors import BaseExtractor -from goose.videos import Video +from goose.video import Video VIDEOS_TAGS = ['iframe', 'embed', 'object', 'video'] VIDEO_PROVIDERS = ['youtube', 'vimeo', 'dailymotion', 'kewego'] diff --git a/goose/images/image.py b/goose/image.py similarity index 100% rename from goose/images/image.py rename to goose/image.py diff --git a/goose/images/__init__.py b/goose/images/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/goose/images/utils.py b/goose/utils/images.py 
From 9597fe152b45b685862b746ff5acbb71f89c05e9 Mon Sep 17 00:00:00 2001
From: Xavier Grangier
Date: Wed, 31 Dec 2014 02:45:59 +0100
Subject: [PATCH 068/100] #188 - rename UpgradedImageIExtractor to
 ImageExtractor

---
 goose/crawler.py           | 4 ++--
 goose/extractors/images.py | 4 ----
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/goose/crawler.py b/goose/crawler.py
index 4f360822..4d2518ac 100644
--- a/goose/crawler.py
+++ b/goose/crawler.py
@@ -27,7 +27,7 @@
 from goose.utils import URLHelper, RawHelper
 from goose.extractors.content import StandardContentExtractor
 from goose.extractors.videos import VideoExtractor
-from goose.extractors.images import UpgradedImageIExtractor
+from goose.extractors.images import ImageExtractor
 from goose.cleaners import StandardDocumentCleaner
 from goose.outputformatters import StandardOutputFormatter
 
@@ -176,7 +176,7 @@ def get_html(self, crawl_candidate, parsing_candidate):
         return html
 
     def get_image_extractor(self):
-        return UpgradedImageIExtractor(self.config, self.article)
+        return ImageExtractor(self.config, self.article)
 
     def get_video_extractor(self):
         return VideoExtractor(self.config, self.article)
diff --git a/goose/extractors/images.py b/goose/extractors/images.py
index 1cf9af09..b3396b32 100644
--- a/goose/extractors/images.py
+++ b/goose/extractors/images.py
@@ -44,10 +44,6 @@ def __init__(self, node, parent_depth, sibling_depth):
         self.sibling_depth = sibling_depth
 
 
 class ImageExtractor(object):
-    pass
-
-
-class UpgradedImageIExtractor(ImageExtractor):
 
     def __init__(self, config, article):
         self.custom_site_mapping = {}

From a5e96e74ca387fdd8996a602c32e05bd33fb4339 Mon Sep 17 00:00:00 2001
From: Xavier Grangier
Date: Wed, 31 Dec 2014 02:50:40 +0100
Subject: [PATCH 069/100] #188 - ImageExtractor extends from BaseExtractor

---
 goose/extractors/images.py | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/goose/extractors/images.py b/goose/extractors/images.py
index b3396b32..3af44f5f 100644
--- a/goose/extractors/images.py
+++ b/goose/extractors/images.py
@@ -22,9 +22,12 @@
 """
 import re
 import os
+
 from urlparse import urlparse, urljoin
-from goose.utils import FileHelper
+
+from goose.extractors import BaseExtractor
 from goose.image import Image
+from goose.utils import FileHelper
 from goose.utils.images import ImageUtils
 
 KNOWN_IMG_DOM_NAMES = [
     "yn-story-related-media",
@@ -43,20 +46,14 @@ def __init__(self, node, parent_depth, sibling_depth):
         self.sibling_depth = sibling_depth
 
 
-class ImageExtractor(object):
+class ImageExtractor(BaseExtractor):
 
     def __init__(self, config, article):
-        self.custom_site_mapping = {}
-        self.load_customesite_mapping()
-
-        # article
-        self.article = article
+        super(ImageExtractor, self).__init__(config, article)
 
-        # config
-        self.config = config
+        self.custom_site_mapping = {}
 
-        # parser
-        self.parser = self.config.get_parser()
+        self.load_customesite_mapping()
 
         # What's the minimum bytes for an image we'd accept is
         self.images_min_bytes = 4000
From 0492fb8509f85c8e3c28c1e07b1ffcbe15c8952c Mon Sep 17 00:00:00 2001
From: Xavier Grangier
Date: Wed, 31 Dec 2014 03:02:45 +0100
Subject: [PATCH 070/100] #188 - move title extractor from content to title
 extractor class

---
 goose/crawler.py            |   9 +++-
 goose/extractors/content.py | 100 ++----------------------------------
 goose/extractors/title.py   |  75 ++++++++++++++++++++++++++-
 goose/utils/__init__.py     |  13 -----
 4 files changed, 84 insertions(+), 113 deletions(-)

diff --git a/goose/crawler.py b/goose/crawler.py
index 4d2518ac..e059e902 100644
--- a/goose/crawler.py
+++ b/goose/crawler.py
@@ -27,6 +27,7 @@
 from goose.utils import URLHelper, RawHelper
 from goose.extractors.content import StandardContentExtractor
 from goose.extractors.videos import VideoExtractor
+from goose.extractors.title import TitleExtractor
 from goose.extractors.images import ImageExtractor
 from goose.cleaners import StandardDocumentCleaner
 from goose.outputformatters import StandardOutputFormatter
@@ -67,6 +68,9 @@ def __init__(self, config):
         # video extractor
         self.video_extractor = self.get_video_extractor()
 
+        # title extractor
+        self.title_extractor = self.get_title_extractor()
+
         # image extrator
         self.image_extractor = self.get_image_extractor()
 
@@ -107,7 +111,7 @@ def crawl(self, crawl_candidate):
         self.article.domain = self.extractor.get_domain()
         self.article.tags = self.extractor.extract_tags()
         self.article.authors = self.extractor.extract_authors()
-        self.article.title = self.extractor.get_title()
+        self.article.title = self.title_extractor.extract()
 
         # check for known node as content body
         # if we find one force the article.doc to be the found node
@@ -175,6 +179,9 @@ def get_html(self, crawl_candidate, parsing_candidate):
         })
         return html
 
+    def get_title_extractor(self):
+        return TitleExtractor(self.config, self.article)
+
     def get_image_extractor(self):
         return ImageExtractor(self.config, self.article)
 
diff --git a/goose/extractors/content.py b/goose/extractors/content.py
index 0053f3f1..c580e844 100644
--- a/goose/extractors/content.py
+++ b/goose/extractors/content.py
@@ -21,22 +21,12 @@
 limitations under the License.
 """
 import re
+
 from copy import deepcopy
 from urlparse import urlparse, urljoin
+
 from goose.extractors import BaseExtractor
-from goose.utils import StringSplitter
-from goose.utils import StringReplacement
-from goose.utils import ReplaceSequence
-
-MOTLEY_REPLACEMENT = StringReplacement("�", "")
-ESCAPED_FRAGMENT_REPLACEMENT = StringReplacement(u"#!", u"?_escaped_fragment_=")
-TITLE_REPLACEMENTS = ReplaceSequence().create(u"»").append(u"»")
-TITLE_SPLITTERS = [u"|", u"-", u"»", u":"]
-PIPE_SPLITTER = StringSplitter("\\|")
-DASH_SPLITTER = StringSplitter(" - ")
-ARROWS_SPLITTER = StringSplitter("»")
-COLON_SPLITTER = StringSplitter(":")
-SPACE_SPLITTER = StringSplitter(' ')
+
 NO_STRINGS = []
 A_REL_TAG_SELECTOR = "a[rel=tag]"
 A_HREF_TAG_SELECTOR = "a[href*='/tag/'], a[href*='/tags/'], a[href*='/topic/'], a[href*='?keyword=']"
@@ -70,90 +60,6 @@ def get_language(self):
                 return self.article.meta_lang[:2]
         return self.config.target_language
 
-    def clean_title(self, title):
-        """Clean title with the use of og:site_name
-        in this case try to get ride of site name
-        and use TITLE_SPLITTERS to reformat title
-        """
-        # check if we have the site name in opengraph data
-        if "site_name" in self.article.opengraph.keys():
-            site_name = self.article.opengraph['site_name']
-            # remove the site name from title
-            title = title.replace(site_name, '').strip()
-
-        # try to remove the domain from url
-        if self.article.domain:
-            pattern = re.compile(self.article.domain, re.IGNORECASE)
-            title = pattern.sub("", title).strip()
-
-        # split the title in words
-        # TechCrunch | my wonderfull article
-        # my wonderfull article | TechCrunch
-        title_words = title.split()
-
-        # check if first letter is in TITLE_SPLITTERS
-        # if so remove it
-        if title_words[0] in TITLE_SPLITTERS:
-            title_words.pop(0)
-
-        # check if last letter is in TITLE_SPLITTERS
-        # if so remove it
-        if title_words[-1] in TITLE_SPLITTERS:
-            title_words.pop(-1)
-
-        # rebuild the title
-        title = u" ".join(title_words).strip()
-
-        return title
-
-    def get_title(self):
-        """\
-        Fetch the article title and analyze it
-        """
-        title = ''
-
-        # rely on opengraph in case we have the data
-        if "title" in self.article.opengraph.keys():
-            title = self.article.opengraph['title']
-            return self.clean_title(title)
-
-        # try to fetch the meta headline
-        meta_headline = self.parser.getElementsByTag(
-            self.article.doc,
-            tag="meta",
-            attr="name",
-            value="headline")
-        if meta_headline is not None and len(meta_headline) > 0:
-            title = self.parser.getAttribute(meta_headline[0], 'content')
-            return self.clean_title(title)
-
-        # otherwise use the title meta
-        title_element = self.parser.getElementsByTag(self.article.doc, tag='title')
-        if title_element is not None and len(title_element) > 0:
-            title = self.parser.getText(title_element[0])
-            return self.clean_title(title)
-
-        return title
-
-    def split_title(self, title, splitter):
-        """\
-        Split the title to best part possible
-        """
-        large_text_length = 0
-        large_text_index = 0
-        title_pieces = splitter.split(title)
-
-        # find the largest title piece
-        for i in range(len(title_pieces)):
-            current = title_pieces[i]
-            if len(current) > large_text_length:
-                large_text_length = len(current)
-                large_text_index = i
-
-        # replace content
-        title = title_pieces[large_text_index]
-        return TITLE_REPLACEMENTS.replaceAll(title).strip()
-
     def get_publish_date(self):
         for known_meta_tag in KNOWN_PUBLISH_DATE_TAGS:
             meta_tags = self.parser.getElementsByTag(self.article.doc,
diff --git a/goose/extractors/title.py b/goose/extractors/title.py
index 1afdb37e..8104c52b 100644
--- a/goose/extractors/title.py
+++ b/goose/extractors/title.py
@@ -20,9 +20,80 @@
 See the License for the specific language governing permissions and
 limitations under the License.
 """
+import re
 
 from goose.extractors import BaseExtractor
 
 
-class ContentTitleExtractor(BaseExtractor):
-    pass
+TITLE_SPLITTERS = [u"|", u"-", u"»", u":"]
+
+
+class TitleExtractor(BaseExtractor):
+
+    def clean_title(self, title):
+        """Clean title with the use of og:site_name
+        in this case try to get rid of site name
+        and use TITLE_SPLITTERS to reformat title
+        """
+        # check if we have the site name in opengraph data
+        if "site_name" in self.article.opengraph.keys():
+            site_name = self.article.opengraph['site_name']
+            # remove the site name from title
+            title = title.replace(site_name, '').strip()
+
+        # try to remove the domain from url
+        if self.article.domain:
+            pattern = re.compile(self.article.domain, re.IGNORECASE)
+            title = pattern.sub("", title).strip()
+
+        # split the title in words
+        # TechCrunch | my wonderful article
+        # my wonderful article | TechCrunch
+        title_words = title.split()
+
+        # check if first letter is in TITLE_SPLITTERS
+        # if so remove it
+        if title_words[0] in TITLE_SPLITTERS:
+            title_words.pop(0)
+
+        # check if last letter is in TITLE_SPLITTERS
+        # if so remove it
+        if title_words[-1] in TITLE_SPLITTERS:
+            title_words.pop(-1)
+
+        # rebuild the title
+        title = u" ".join(title_words).strip()
+
+        return title
+
+    def get_title(self):
+        """\
+        Fetch the article title and analyze it
+        """
+        title = ''
+
+        # rely on opengraph in case we have the data
+        if "title" in self.article.opengraph.keys():
+            title = self.article.opengraph['title']
+            return self.clean_title(title)
+
+        # try to fetch the meta headline
+        meta_headline = self.parser.getElementsByTag(
+            self.article.doc,
+            tag="meta",
+            attr="name",
+            value="headline")
+        if meta_headline is not None and len(meta_headline) > 0:
+            title = self.parser.getAttribute(meta_headline[0], 'content')
+            return self.clean_title(title)
+
+        # otherwise use the title meta
+        title_element = self.parser.getElementsByTag(self.article.doc, tag='title')
+        if title_element is not None and len(title_element) > 0:
+            title = self.parser.getText(title_element[0])
+            return self.clean_title(title)
+
+        return title
+
+    def extract(self):
+        return self.get_title()
diff --git a/goose/utils/__init__.py b/goose/utils/__init__.py
index a8be19b5..5a1de7d4 100644
--- a/goose/utils/__init__.py
+++ b/goose/utils/__init__.py
@@ -105,19 +105,6 @@ def get_parsing_candidate(self, url_to_crawl):
         return ParsingCandidate(final_url, link_hash)
 
 
-class StringSplitter(object):
-    """\
-
-    """
-    def __init__(self, pattern):
-        self.pattern = re.compile(pattern)
-
-    def split(self, string):
-        if not string:
-            return []
-        return self.pattern.split(string)
-
-
 class StringReplacement(object):
 
     def __init__(self, pattern, replaceWith):
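The splitter-trimming in clean_title is the subtle step: once the site name is removed, a dangling separator can be left at either end of the title. A standalone sketch of that step (the empty-list guards are an addition of this sketch, not goose behavior):

    TITLE_SPLITTERS = [u"|", u"-", u"»", u":"]

    def clean_title(title, site_name):
        title = title.replace(site_name, '').strip()
        words = title.split()
        # drop a separator left dangling at either end
        if words and words[0] in TITLE_SPLITTERS:
            words.pop(0)
        if words and words[-1] in TITLE_SPLITTERS:
            words.pop(-1)
        return u" ".join(words).strip()

    assert clean_title(u"My wonderful article | TechCrunch",
                       u"TechCrunch") == u"My wonderful article"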
""" import re + from copy import deepcopy from urlparse import urlparse, urljoin + from goose.extractors import BaseExtractor -from goose.utils import StringSplitter -from goose.utils import StringReplacement -from goose.utils import ReplaceSequence - -MOTLEY_REPLACEMENT = StringReplacement("�", "") -ESCAPED_FRAGMENT_REPLACEMENT = StringReplacement(u"#!", u"?_escaped_fragment_=") -TITLE_REPLACEMENTS = ReplaceSequence().create(u"»").append(u"»") -TITLE_SPLITTERS = [u"|", u"-", u"»", u":"] -PIPE_SPLITTER = StringSplitter("\\|") -DASH_SPLITTER = StringSplitter(" - ") -ARROWS_SPLITTER = StringSplitter("»") -COLON_SPLITTER = StringSplitter(":") -SPACE_SPLITTER = StringSplitter(' ') + NO_STRINGS = [] A_REL_TAG_SELECTOR = "a[rel=tag]" A_HREF_TAG_SELECTOR = "a[href*='/tag/'], a[href*='/tags/'], a[href*='/topic/'], a[href*='?keyword=']" @@ -70,90 +60,6 @@ def get_language(self): return self.article.meta_lang[:2] return self.config.target_language - def clean_title(self, title): - """Clean title with the use of og:site_name - in this case try to get ride of site name - and use TITLE_SPLITTERS to reformat title - """ - # check if we have the site name in opengraph data - if "site_name" in self.article.opengraph.keys(): - site_name = self.article.opengraph['site_name'] - # remove the site name from title - title = title.replace(site_name, '').strip() - - # try to remove the domain from url - if self.article.domain: - pattern = re.compile(self.article.domain, re.IGNORECASE) - title = pattern.sub("", title).strip() - - # split the title in words - # TechCrunch | my wonderfull article - # my wonderfull article | TechCrunch - title_words = title.split() - - # check if first letter is in TITLE_SPLITTERS - # if so remove it - if title_words[0] in TITLE_SPLITTERS: - title_words.pop(0) - - # check if last letter is in TITLE_SPLITTERS - # if so remove it - if title_words[-1] in TITLE_SPLITTERS: - title_words.pop(-1) - - # rebuild the title - title = u" ".join(title_words).strip() - - return title - - def get_title(self): - """\ - Fetch the article title and analyze it - """ - title = '' - - # rely on opengraph in case we have the data - if "title" in self.article.opengraph.keys(): - title = self.article.opengraph['title'] - return self.clean_title(title) - - # try to fetch the meta headline - meta_headline = self.parser.getElementsByTag( - self.article.doc, - tag="meta", - attr="name", - value="headline") - if meta_headline is not None and len(meta_headline) > 0: - title = self.parser.getAttribute(meta_headline[0], 'content') - return self.clean_title(title) - - # otherwise use the title meta - title_element = self.parser.getElementsByTag(self.article.doc, tag='title') - if title_element is not None and len(title_element) > 0: - title = self.parser.getText(title_element[0]) - return self.clean_title(title) - - return title - - def split_title(self, title, splitter): - """\ - Split the title to best part possible - """ - large_text_length = 0 - large_text_index = 0 - title_pieces = splitter.split(title) - - # find the largest title piece - for i in range(len(title_pieces)): - current = title_pieces[i] - if len(current) > large_text_length: - large_text_length = len(current) - large_text_index = i - - # replace content - title = title_pieces[large_text_index] - return TITLE_REPLACEMENTS.replaceAll(title).strip() - def get_publish_date(self): for known_meta_tag in KNOWN_PUBLISH_DATE_TAGS: meta_tags = self.parser.getElementsByTag(self.article.doc, diff --git a/goose/extractors/title.py b/goose/extractors/title.py 
From 2608e43591c0b3eeff7477e7dafe1643356607bb Mon Sep 17 00:00:00 2001
From: Xavier Grangier
Date: Wed, 31 Dec 2014 03:11:04 +0100
Subject: [PATCH 072/100] #188 - move tweet extraction to TweetsExtractor class

---
 goose/crawler.py            |  9 +++++++-
 goose/extractors/content.py | 15 -------------
 goose/extractors/tweets.py  | 42 +++++++++++++++++++++++++++++++++++++
 3 files changed, 50 insertions(+), 16 deletions(-)
 create mode 100644 goose/extractors/tweets.py

diff --git a/goose/crawler.py b/goose/crawler.py
index 8482318f..297b4ff8 100644
--- a/goose/crawler.py
+++ b/goose/crawler.py
@@ -30,6 +30,7 @@
 from goose.extractors.title import TitleExtractor
 from goose.extractors.images import ImageExtractor
 from goose.extractors.links import LinksExtractor
+from goose.extractors.tweets import TweetsExtractor
 from goose.cleaners import StandardDocumentCleaner
 from goose.outputformatters import StandardOutputFormatter
 
@@ -66,6 +67,9 @@ def __init__(self, config):
         # init the output formatter
         self.formatter = self.get_formatter()
 
+        # tweets extractor
+        self.tweets_extractor = self.get_tweets_extractor()
+
         # links extractor
         self.links_extractor = self.get_links_extractor()
 
@@ -138,7 +142,7 @@ def crawl(self, crawl_candidate):
             self.article.links = self.links_extractor.extract()
 
             # tweets
-            self.article.tweets = self.extractor.extract_tweets()
+            self.article.tweets = self.tweets_extractor.extract()
 
             # video handling
             self.video_extractor.get_videos()
@@ -183,6 +187,9 @@ def get_html(self, crawl_candidate, parsing_candidate):
         })
         return html
 
+    def get_tweets_extractor(self):
+        return TweetsExtractor(self.config, self.article)
+
     def get_links_extractor(self):
         return LinksExtractor(self.config, self.article)
 
diff --git a/goose/extractors/content.py b/goose/extractors/content.py
index 74fe230a..d0442b97 100644
--- a/goose/extractors/content.py
+++ b/goose/extractors/content.py
@@ -192,21 +192,6 @@ def extract_opengraph(self):
                 opengraph_dict.update({attr.split(":")[1]: value})
         return opengraph_dict
 
-    def extract_tweets(self):
-        tweets = []
-        items = self.parser.getElementsByTag(
-            self.article.top_node,
-            tag='blockquote',
-            attr="class",
-            value="twitter-tweet")
-
-        for i in items:
-            for attr in ['gravityScore', 'gravityNodes']:
-                self.parser.delAttribute(i, attr)
-            tweets.append(self.parser.nodeToString(i))
-
-        return tweets
-
     def extract_authors(self):
diff --git a/goose/extractors/tweets.py b/goose/extractors/tweets.py
new file mode 100644
index 00000000..3c17ad8d
--- /dev/null
+++ b/goose/extractors/tweets.py
@@ -0,0 +1,42 @@
+# -*- coding: utf-8 -*-
+"""\
+This is a python port of "Goose" originally licensed to Gravity.com
+under one or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.
+
+Python port was written by Xavier Grangier for Recrutae
+
+Gravity.com licenses this file
+to you under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from goose.extractors import BaseExtractor
+
+
+class TweetsExtractor(BaseExtractor):
+
+    def extract(self):
+        tweets = []
+        items = self.parser.getElementsByTag(
+            self.article.top_node,
+            tag='blockquote',
+            attr="class",
+            value="twitter-tweet")
+
+        for i in items:
+            for attr in ['gravityScore', 'gravityNodes']:
+                self.parser.delAttribute(i, attr)
+            tweets.append(self.parser.nodeToString(i))
+
+        return tweets
From 4de0f4b0ed54c857bde89af131606f7a30b34846 Mon Sep 17 00:00:00 2001
From: Xavier Grangier
Date: Wed, 31 Dec 2014 03:13:46 +0100
Subject: [PATCH 073/100] #188 - move authors extraction to AuthorsExtractor
 class

---
 goose/crawler.py            | 13 ++++++++++++-
 goose/extractors/author.py  | 22 ++++++++++++++++++++--
 goose/extractors/content.py | 19 -------------------
 3 files changed, 32 insertions(+), 22 deletions(-)

diff --git a/goose/crawler.py b/goose/crawler.py
index 297b4ff8..53f56283 100644
--- a/goose/crawler.py
+++ b/goose/crawler.py
@@ -31,6 +31,7 @@
 from goose.extractors.images import ImageExtractor
 from goose.extractors.links import LinksExtractor
 from goose.extractors.tweets import TweetsExtractor
+from goose.extractors.authors import AuthorsExtractor
 from goose.cleaners import StandardDocumentCleaner
 from goose.outputformatters import StandardOutputFormatter
 
@@ -67,6 +68,9 @@ def __init__(self, config):
         # init the output formatter
         self.formatter = self.get_formatter()
 
+        # authors extractor
+        self.authors_extractor = self.get_authors_extractor()
+
         # tweets extractor
         self.tweets_extractor = self.get_tweets_extractor()
 
@@ -118,7 +122,11 @@ def crawl(self, crawl_candidate):
         self.article.canonical_link = self.extractor.get_canonical_link()
         self.article.domain = self.extractor.get_domain()
         self.article.tags = self.extractor.extract_tags()
-        self.article.authors = self.extractor.extract_authors()
+
+        # authors
+        self.article.authors = self.authors_extractor.extract()
+
+        # title
         self.article.title = self.title_extractor.extract()
 
@@ -187,6 +195,9 @@ def get_html(self, crawl_candidate, parsing_candidate):
         })
         return html
 
+    def get_authors_extractor(self):
+        return AuthorsExtractor(self.config, self.article)
+
     def get_tweets_extractor(self):
         return TweetsExtractor(self.config, self.article)
 
diff --git a/goose/extractors/author.py b/goose/extractors/author.py
index bc18925a..414f4eea 100644
--- a/goose/extractors/author.py
+++ b/goose/extractors/author.py
@@ -24,5 +24,23 @@
 from goose.extractors import BaseExtractor
 
 
-class ContentAuthorExtractor(BaseExtractor):
-    pass
+class AuthorsExtractor(BaseExtractor):
+
+    def extract(self):
+        authors = []
+        author_nodes = self.parser.getElementsByTag(
+            self.article.doc,
+            attr='itemprop',
+            value='author')
+
+        for author in author_nodes:
+            name_nodes = self.parser.getElementsByTag(
+                author,
+                attr='itemprop',
+                value='name')
+
+            if len(name_nodes) > 0:
+                name = self.parser.getText(name_nodes[0])
+                authors.append(name)
+
+        return list(set(authors))
diff --git a/goose/extractors/content.py b/goose/extractors/content.py
index d0442b97..4b4e894a 100644
--- a/goose/extractors/content.py
+++ b/goose/extractors/content.py
@@ -192,25 +192,6 @@ def extract_opengraph(self):
                 opengraph_dict.update({attr.split(":")[1]: value})
         return opengraph_dict
 
-    def extract_authors(self):
-        authors = []
-        author_nodes = self.parser.getElementsByTag(
-            self.article.doc,
-            attr='itemprop',
-            value='author')
-
-        for author in author_nodes:
-            name_nodes = self.parser.getElementsByTag(
-                author,
-                attr='itemprop',
-                value='name')
-
-            if len(name_nodes) > 0:
-                name = self.parser.getText(name_nodes[0])
-                authors.append(name)
-
-        return list(set(authors))
-
     def extract_tags(self):
         node = self.article.doc
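The itemprop lookup is two nested microdata queries: find author scopes, then name nodes inside each. A rough equivalent with plain lxml and XPath (the HTML snippet is made up):

    import lxml.html

    html = '''
    <div itemprop="author">
      <span itemprop="name">Jane Doe</span>
    </div>
    '''
    doc = lxml.html.fromstring(html)
    names = [n.text_content().strip()
             for n in doc.xpath('//*[@itemprop="author"]//*[@itemprop="name"]')]
    assert names == ['Jane Doe']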
+""" + +from goose.extractors import BaseExtractor + + +class LinksExtractor(BaseExtractor): + + def extract(self): + links = [] + items = self.parser.getElementsByTag(self.article.top_node, 'a') + for i in items: + attr = self.parser.getAttribute(i, 'href') + if attr: + links.append(attr) + return links From 2608e43591c0b3eeff7477e7dafe1643356607bb Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Wed, 31 Dec 2014 03:11:04 +0100 Subject: [PATCH 072/100] #188 - move tweet extraction to TweetExtractor class --- goose/crawler.py | 9 +++++++- goose/extractors/content.py | 15 ------------- goose/extractors/tweets.py | 42 +++++++++++++++++++++++++++++++++++++ 3 files changed, 50 insertions(+), 16 deletions(-) create mode 100644 goose/extractors/tweets.py diff --git a/goose/crawler.py b/goose/crawler.py index 8482318f..297b4ff8 100644 --- a/goose/crawler.py +++ b/goose/crawler.py @@ -30,6 +30,7 @@ from goose.extractors.title import TitleExtractor from goose.extractors.images import ImageExtractor from goose.extractors.links import LinksExtractor +from goose.extractors.tweets import TweetsExtractor from goose.cleaners import StandardDocumentCleaner from goose.outputformatters import StandardOutputFormatter @@ -66,6 +67,9 @@ def __init__(self, config): # init the output formatter self.formatter = self.get_formatter() + # tweets extractor + self.tweets_extractor = self.get_tweets_extractor() + # links extractor self.links_extractor = self.get_links_extractor() @@ -138,7 +142,7 @@ def crawl(self, crawl_candidate): self.article.links = self.links_extractor.extract() # tweets - self.article.tweets = self.extractor.extract_tweets() + self.article.tweets = self.tweets_extractor.extract() # video handling self.video_extractor.get_videos() @@ -183,6 +187,9 @@ def get_html(self, crawl_candidate, parsing_candidate): }) return html + def get_tweets_extractor(self): + return TweetsExtractor(self.config, self.article) + def get_links_extractor(self): return LinksExtractor(self.config, self.article) diff --git a/goose/extractors/content.py b/goose/extractors/content.py index 74fe230a..d0442b97 100644 --- a/goose/extractors/content.py +++ b/goose/extractors/content.py @@ -192,21 +192,6 @@ def extract_opengraph(self): opengraph_dict.update({attr.split(":")[1]: value}) return opengraph_dict - def extract_tweets(self): - tweets = [] - items = self.parser.getElementsByTag( - self.article.top_node, - tag='blockquote', - attr="class", - value="twitter-tweet") - - for i in items: - for attr in ['gravityScore', 'gravityNodes']: - self.parser.delAttribute(i, attr) - tweets.append(self.parser.nodeToString(i)) - - return tweets - def extract_authors(self): authors = [] author_nodes = self.parser.getElementsByTag( diff --git a/goose/extractors/tweets.py b/goose/extractors/tweets.py new file mode 100644 index 00000000..3c17ad8d --- /dev/null +++ b/goose/extractors/tweets.py @@ -0,0 +1,42 @@ +# -*- coding: utf-8 -*- +"""\ +This is a python port of "Goose" orignialy licensed to Gravity.com +under one or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. + +Python port was written by Xavier Grangier for Recrutae + +Gravity.com licenses this file +to you under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance +with the License. 
From 1cb9ed44aeeab1619c14d0859dbfd5ab2f80c3b6 Mon Sep 17 00:00:00 2001
From: Xavier Grangier
Date: Wed, 31 Dec 2014 03:23:10 +0100
Subject: [PATCH 076/100] #188 - move opengraph extraction to
 OpenGraphExtractor class

---
 goose/crawler.py              | 12 +++++++++++-
 goose/extractors/content.py   | 11 -----------
 goose/extractors/opengraph.py | 14 ++++++++++++--
 3 files changed, 23 insertions(+), 14 deletions(-)

diff --git a/goose/crawler.py b/goose/crawler.py
index 048dc83c..416e93a8 100644
--- a/goose/crawler.py
+++ b/goose/crawler.py
@@ -33,6 +33,7 @@
 from goose.extractors.tweets import TweetsExtractor
 from goose.extractors.authors import AuthorsExtractor
 from goose.extractors.tags import TagsExtractor
+from goose.extractors.opengraph import OpenGraphExtractor
 from goose.cleaners import StandardDocumentCleaner
 from goose.outputformatters import StandardOutputFormatter
 
@@ -69,6 +70,9 @@ def __init__(self, config):
         # init the output formatter
         self.formatter = self.get_formatter()
 
+        # opengraph_ extractor
+        self.opengraph_extractor = self.get_opengraph_extractor()
+
         # tags extractor
         self.tags_extractor = self.get_tags_extractor()
 
@@ -116,7 +120,10 @@ def crawl(self, crawl_candidate):
         self.article.raw_html = raw_html
         self.article.doc = doc
         self.article.raw_doc = deepcopy(doc)
-        self.article.opengraph = self.extractor.extract_opengraph()
+
+        # open graph
+        self.article.opengraph = self.opengraph_extractor.extract()
+
         self.article.publish_date = self.extractor.get_publish_date()
 
@@ -201,6 +208,9 @@ def get_html(self, crawl_candidate, parsing_candidate):
         })
         return html
 
+    def get_opengraph_extractor(self):
+        return OpenGraphExtractor(self.config, self.article)
+
     def get_tags_extractor(self):
         return TagsExtractor(self.config, self.article)
 
diff --git a/goose/extractors/content.py b/goose/extractors/content.py
index db88a406..832cbc21 100644
--- a/goose/extractors/content.py
+++ b/goose/extractors/content.py
@@ -178,17 +178,6 @@ def is_articlebody(self, node):
 
         return False
 
-    def extract_opengraph(self):
-        opengraph_dict = {}
-        node = self.article.doc
-        metas = self.parser.getElementsByTag(node, 'meta')
-        for meta in metas:
-            attr = self.parser.getAttribute(meta, 'property')
-            if attr is not None and attr.startswith("og:"):
-                value = self.parser.getAttribute(meta, 'content')
-                opengraph_dict.update({attr.split(":")[1]: value})
-        return opengraph_dict
-
     def calculate_best_node(self):
         doc = self.article.doc
 
diff --git a/goose/extractors/opengraph.py b/goose/extractors/opengraph.py
index ee916b82..a52ac349 100644
--- a/goose/extractors/opengraph.py
+++ b/goose/extractors/opengraph.py
@@ -24,5 +24,15 @@
 from goose.extractors import BaseExtractor
 
 
-class ContentOpenGraphExtractor(BaseExtractor):
-    pass
+class OpenGraphExtractor(BaseExtractor):
+
+    def extract(self):
+        opengraph_dict = {}
+        node = self.article.doc
+        metas = self.parser.getElementsByTag(node, 'meta')
+        for meta in metas:
+            attr = self.parser.getAttribute(meta, 'property')
+            if attr is not None and attr.startswith("og:"):
+                value = self.parser.getAttribute(meta, 'content')
+                opengraph_dict.update({attr.split(":")[1]: value})
+        return opengraph_dict
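The extract() loop keys the result dict on whatever follows the og: prefix. The same walk over a raw lxml tree, with a made-up head section:

    import lxml.html

    doc = lxml.html.fromstring(
        '<html><head>'
        '<meta property="og:title" content="My title"/>'
        '<meta property="og:site_name" content="Example"/>'
        '</head><body></body></html>')
    og = {}
    for meta in doc.xpath('//meta'):
        prop = meta.get('property')
        if prop is not None and prop.startswith('og:'):
            og[prop.split(':')[1]] = meta.get('content')
    assert og == {'title': 'My title', 'site_name': 'Example'}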
From 8320262dbcb8bdad9c65e6dac08903eea140ddec Mon Sep 17 00:00:00 2001
From: Xavier Grangier
Date: Wed, 31 Dec 2014 03:32:12 +0100
Subject: [PATCH 077/100] #188 - move publishdate extraction to
 PublishDateExtractor class

---
 goose/crawler.py                | 13 +++++++++++--
 goose/extractors/content.py     | 15 ---------------
 goose/extractors/publishdate.py | 22 ++++++++++++++++++++--
 3 files changed, 31 insertions(+), 19 deletions(-)

diff --git a/goose/crawler.py b/goose/crawler.py
index 416e93a8..ea5a5221 100644
--- a/goose/crawler.py
+++ b/goose/crawler.py
@@ -34,6 +34,7 @@
 from goose.extractors.authors import AuthorsExtractor
 from goose.extractors.tags import TagsExtractor
 from goose.extractors.opengraph import OpenGraphExtractor
+from goose.extractors.publishdate import PublishDateExtractor
 from goose.cleaners import StandardDocumentCleaner
 from goose.outputformatters import StandardOutputFormatter
 
@@ -70,7 +71,10 @@ def __init__(self, config):
         # init the output formatter
         self.formatter = self.get_formatter()
 
-        # opengraph_ extractor
+        # publishdate extractor
+        self.publishdate_extractor = self.get_publishdate_extractor()
+
+        # opengraph extractor
         self.opengraph_extractor = self.get_opengraph_extractor()
 
@@ -124,7 +128,9 @@ def crawl(self, crawl_candidate):
         # open graph
         self.article.opengraph = self.opengraph_extractor.extract()
 
-        self.article.publish_date = self.extractor.get_publish_date()
+        # publishdate
+        self.article.publish_date = self.publishdate_extractor.extract()
+
         # self.article.additional_data = config.get_additionaldata_extractor.extract(doc)
         self.article.meta_lang = self.extractor.get_meta_lang()
 
@@ -208,6 +214,9 @@ def get_html(self, crawl_candidate, parsing_candidate):
         })
         return html
 
+    def get_publishdate_extractor(self):
+        return PublishDateExtractor(self.config, self.article)
+
     def get_opengraph_extractor(self):
         return OpenGraphExtractor(self.config, self.article)
 
diff --git a/goose/extractors/content.py b/goose/extractors/content.py
index 832cbc21..3ca40a82 100644
--- a/goose/extractors/content.py
+++ b/goose/extractors/content.py
@@ -29,13 +29,6 @@
 RE_LANG = r'^[A-Za-z]{2}$'
 
-KNOWN_PUBLISH_DATE_TAGS = [
-    {'attribute': 'property', 'value': 'rnews:datePublished', 'content': 'content'},
-    {'attribute': 'property', 'value': 'article:published_time', 'content': 'content'},
-    {'attribute': 'name', 'value': 'OriginalPublicationDate', 'content': 'content'},
-    {'attribute': 'itemprop', 'value': 'datePublished', 'content': 'datetime'},
-]
-
 KNOWN_ARTICLE_CONTENT_TAGS = [
     {'attr': 'itemprop', 'value': 'articleBody'},
     {'attr': 'class', 'value': 'post-content'},
@@ -50,14 +43,6 @@ def get_language(self):
                 return self.article.meta_lang[:2]
         return self.config.target_language
 
-    def get_publish_date(self):
-        for known_meta_tag in KNOWN_PUBLISH_DATE_TAGS:
-            meta_tags = self.parser.getElementsByTag(self.article.doc,
-                attr=known_meta_tag['attribute'],
-                value=known_meta_tag['value'])
-            if meta_tags:
-                return self.parser.getAttribute(meta_tags[0], known_meta_tag['content'])
-
     def get_favicon(self):
         """\
         Extract the favicon from a website
diff --git a/goose/extractors/publishdate.py b/goose/extractors/publishdate.py
index 7ea1635a..1768b1a0 100644
--- a/goose/extractors/publishdate.py
+++ b/goose/extractors/publishdate.py
@@ -23,6 +23,24 @@
 
 from goose.extractors import BaseExtractor
 
+KNOWN_PUBLISH_DATE_TAGS = [
+    {'attribute': 'property', 'value': 'rnews:datePublished', 'content': 'content'},
+    {'attribute': 'property', 'value': 'article:published_time', 'content': 'content'},
+    {'attribute': 'name', 'value': 'OriginalPublicationDate', 'content': 'content'},
+    {'attribute': 'itemprop', 'value': 'datePublished', 'content': 'datetime'},
+]
 
-class ContentPublishDateExtractor(BaseExtractor):
-    pass
+
+class PublishDateExtractor(BaseExtractor):
+    def extract(self):
+        for known_meta_tag in KNOWN_PUBLISH_DATE_TAGS:
+            meta_tags = self.parser.getElementsByTag(
+                self.article.doc,
+                attr=known_meta_tag['attribute'],
+                value=known_meta_tag['value'])
+            if meta_tags:
+                return self.parser.getAttribute(
+                    meta_tags[0],
+                    known_meta_tag['content']
+                )
+        return None
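Each rule names the attribute to match, the value it must carry, and the attribute holding the date, so the first matching rule wins. The same lookup against a raw lxml tree, using two of the rules above (the page snippet is made up):

    import lxml.html

    doc = lxml.html.fromstring(
        '<html><head><meta property="article:published_time" '
        'content="2014-12-30T08:00:00Z"/></head><body></body></html>')

    rules = [
        {'attribute': 'property', 'value': 'article:published_time', 'content': 'content'},
        {'attribute': 'itemprop', 'value': 'datePublished', 'content': 'datetime'},
    ]
    date = None
    for rule in rules:
        hits = doc.xpath('//*[@%s="%s"]' % (rule['attribute'], rule['value']))
        if hits:
            date = hits[0].get(rule['content'])
            break
    assert date == '2014-12-30T08:00:00Z'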
From 08fd6b975147fc86018f07346c09de19e682ca55 Mon Sep 17 00:00:00 2001
From: Xavier Grangier
Date: Wed, 31 Dec 2014 05:07:38 +0100
Subject: [PATCH 078/100] #188 - move meta extraction to MetasExtractor class

---
 goose/crawler.py            |  22 +++++---
 goose/extractors/content.py |  89 ------------------------
 goose/extractors/meta.py    | 104 +++++++++++++++++++++++++++++++++++-
 3 files changed, 118 insertions(+), 97 deletions(-)

diff --git a/goose/crawler.py b/goose/crawler.py
index ea5a5221..fd577405 100644
--- a/goose/crawler.py
+++ b/goose/crawler.py
@@ -35,6 +35,7 @@
 from goose.extractors.tags import TagsExtractor
 from goose.extractors.opengraph import OpenGraphExtractor
 from goose.extractors.publishdate import PublishDateExtractor
+from goose.extractors.metas import MetasExtractor
 from goose.cleaners import StandardDocumentCleaner
 from goose.outputformatters import StandardOutputFormatter
 
@@ -71,6 +72,9 @@ def __init__(self, config):
         # init the output formatter
         self.formatter = self.get_formatter()
 
+        # metas extractor
+        self.metas_extractor = self.get_metas_extractor()
+
         # publishdate extractor
         self.publishdate_extractor = self.get_publishdate_extractor()
 
@@ -131,12 +135,15 @@ def crawl(self, crawl_candidate):
         # publishdate
         self.article.publish_date = self.publishdate_extractor.extract()
 
-        # self.article.additional_data = config.get_additionaldata_extractor.extract(doc)
-        self.article.meta_lang = self.extractor.get_meta_lang()
-        self.article.meta_favicon = self.extractor.get_favicon()
-        self.article.meta_description = self.extractor.get_meta_description()
-        self.article.meta_keywords = self.extractor.get_meta_keywords()
-        self.article.canonical_link = self.extractor.get_canonical_link()
+        # meta
+        metas = self.metas_extractor.extract()
+        self.article.meta_lang = metas['lang']
+        self.article.meta_favicon = metas['favicon']
+        self.article.meta_description = metas['description']
+        self.article.meta_keywords = metas['keywords']
+        self.article.canonical_link = metas['canonical']
+
+        # domain
         self.article.domain = self.extractor.get_domain()
 
         # tags
@@ -214,6 +221,9 @@ def get_html(self, crawl_candidate, parsing_candidate):
         })
         return html
 
+    def get_metas_extractor(self):
+        return MetasExtractor(self.config, self.article)
+
     def get_publishdate_extractor(self):
         return PublishDateExtractor(self.config, self.article)
 
diff --git a/goose/extractors/content.py b/goose/extractors/content.py
index 3ca40a82..557840f4 100644
--- a/goose/extractors/content.py
+++ b/goose/extractors/content.py
@@ -20,14 +20,10 @@
 See the License for the specific language governing permissions and
 limitations under the License.
 """
-import re
-
 from copy import deepcopy
-from urlparse import urlparse, urljoin
+
 from goose.extractors import BaseExtractor
 
-RE_LANG = r'^[A-Za-z]{2}$'
 
 KNOWN_ARTICLE_CONTENT_TAGS = [
     {'attr': 'itemprop', 'value': 'articleBody'},
     {'attr': 'class', 'value': 'post-content'},
@@ -50,91 +46,6 @@ def get_language(self):
                 return self.article.meta_lang[:2]
         return self.config.target_language
 
-    def get_favicon(self):
-        """\
-        Extract the favicon from a website
-        http://en.wikipedia.org/wiki/Favicon
-
-
-        """
-        kwargs = {'tag': 'link', 'attr': 'rel', 'value': 'icon'}
-        meta = self.parser.getElementsByTag(self.article.doc, **kwargs)
-        if meta:
-            favicon = self.parser.getAttribute(meta[0], 'href')
-            return favicon
-        return ''
-
-    def get_meta_lang(self):
-        """\
-        Extract content language from meta
-        """
-        # we have a lang attribute in html
-        attr = self.parser.getAttribute(self.article.doc, attr='lang')
-        if attr is None:
-            # look up for a Content-Language in meta
-            items = [
-                {'tag': 'meta', 'attr': 'http-equiv', 'value': 'content-language'},
-                {'tag': 'meta', 'attr': 'name', 'value': 'lang'}
-            ]
-            for item in items:
-                meta = self.parser.getElementsByTag(self.article.doc, **item)
-                if meta:
-                    attr = self.parser.getAttribute(meta[0], attr='content')
-                    break
-
-        if attr:
-            value = attr[:2]
-            if re.search(RE_LANG, value):
-                return value.lower()
-
-        return None
-
-    def get_meta_content(self, doc, metaName):
-        """\
-        Extract a given meta content form document
-        """
-        meta = self.parser.css_select(doc, metaName)
-        content = None
-
-        if meta is not None and len(meta) > 0:
-            content = self.parser.getAttribute(meta[0], 'content')
-
-        if content:
-            return content.strip()
-
-        return ''
-
-    def get_meta_description(self):
-        """\
-        if the article has meta description set in the source, use that
-        """
-        return self.get_meta_content(self.article.doc, "meta[name=description]")
-
-    def get_meta_keywords(self):
-        """\
-        if the article has meta keywords set in the source, use that
-        """
-        return self.get_meta_content(self.article.doc, "meta[name=keywords]")
-
-    def get_canonical_link(self):
-        """\
-        if the article has meta canonical link set in the url
-        """
-        if self.article.final_url:
-            kwargs = {'tag': 'link', 'attr': 'rel', 'value': 'canonical'}
-            meta = self.parser.getElementsByTag(self.article.doc, **kwargs)
-            if meta is not None and len(meta) > 0:
-                href = self.parser.getAttribute(meta[0], 'href')
-                if href:
-                    href = href.strip()
-                    o = urlparse(href)
-                    if not o.hostname:
-                        z = urlparse(self.article.final_url)
-                        domain = '%s://%s' % (z.scheme, z.hostname)
-                        href = urljoin(domain, href)
-                    return href
-        return self.article.final_url
-
     def get_domain(self):
         if self.article.final_url:
             o = urlparse(self.article.final_url)
diff --git a/goose/extractors/meta.py b/goose/extractors/meta.py
index
7a92df21..efde6714 100644 --- a/goose/extractors/meta.py +++ b/goose/extractors/meta.py @@ -21,8 +21,108 @@ limitations under the License. """ +import re +from urlparse import urljoin +from urlparse import urlparse + from goose.extractors import BaseExtractor -class ContentMetaExtractor(BaseExtractor): - pass +RE_LANG = r'^[A-Za-z]{2}$' + + +class MetasExtractor(BaseExtractor): + + def get_favicon(self): + """\ + Extract the favicon from a website + http://en.wikipedia.org/wiki/Favicon + + + """ + kwargs = {'tag': 'link', 'attr': 'rel', 'value': 'icon'} + meta = self.parser.getElementsByTag(self.article.doc, **kwargs) + if meta: + favicon = self.parser.getAttribute(meta[0], 'href') + return favicon + return '' + + def get_canonical_link(self): + """\ + if the article has meta canonical link set in the url + """ + if self.article.final_url: + kwargs = {'tag': 'link', 'attr': 'rel', 'value': 'canonical'} + meta = self.parser.getElementsByTag(self.article.doc, **kwargs) + if meta is not None and len(meta) > 0: + href = self.parser.getAttribute(meta[0], 'href') + if href: + href = href.strip() + o = urlparse(href) + if not o.hostname: + z = urlparse(self.article.final_url) + domain = '%s://%s' % (z.scheme, z.hostname) + href = urljoin(domain, href) + return href + return self.article.final_url + + def get_meta_lang(self): + """\ + Extract content language from meta + """ + # we have a lang attribute in html + attr = self.parser.getAttribute(self.article.doc, attr='lang') + if attr is None: + # look up for a Content-Language in meta + items = [ + {'tag': 'meta', 'attr': 'http-equiv', 'value': 'content-language'}, + {'tag': 'meta', 'attr': 'name', 'value': 'lang'} + ] + for item in items: + meta = self.parser.getElementsByTag(self.article.doc, **item) + if meta: + attr = self.parser.getAttribute(meta[0], attr='content') + break + + if attr: + value = attr[:2] + if re.search(RE_LANG, value): + return value.lower() + + return None + + def get_meta_content(self, metaName): + """\ + Extract a given meta content from document + """ + meta = self.parser.css_select(self.article.doc, metaName) + content = None + + if meta is not None and len(meta) > 0: + content = self.parser.getAttribute(meta[0], 'content') + + if content: + return content.strip() + + return '' + + def get_meta_description(self): + """\ + if the article has meta description set in the source, use that + """ + return self.get_meta_content("meta[name=description]") + + def get_meta_keywords(self): + """\ + if the article has meta keywords set in the source, use that + """ + return self.get_meta_content("meta[name=keywords]") + + def extract(self): + return { + "description": self.get_meta_description(), + "keywords": self.get_meta_keywords(), + "lang": self.get_meta_lang(), + "favicon": self.get_favicon(), + "canonical": self.get_canonical_link() + } From 45843417a7565268557a64afe9601214b3797263 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Wed, 31 Dec 2014 05:08:08 +0100 Subject: [PATCH 079/100] #188 - rename meta extractor file --- goose/extractors/{meta.py => metas.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename goose/extractors/{meta.py => metas.py} (100%) diff --git a/goose/extractors/meta.py b/goose/extractors/metas.py similarity index 100% rename from goose/extractors/meta.py rename to goose/extractors/metas.py From 530ab522e422e71e6b456fc22b77c084539f3fe0 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Wed, 31 Dec 2014 05:12:41 +0100 Subject: [PATCH 080/100] #188 - move domain extraction to meta 
extractor --- goose/crawler.py | 4 +--- goose/extractors/content.py | 6 ------ goose/extractors/metas.py | 9 ++++++++- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/goose/crawler.py b/goose/crawler.py index fd577405..34daf048 100644 --- a/goose/crawler.py +++ b/goose/crawler.py @@ -142,9 +142,7 @@ def crawl(self, crawl_candidate): self.article.meta_description = metas['description'] self.article.meta_keywords = metas['keywords'] self.article.canonical_link = metas['canonical'] - - # domain - self.article.domain = self.extractor.get_domain() + self.article.domain = metas['domain'] # tags self.article.tags = self.tags_extractor.extract() diff --git a/goose/extractors/content.py b/goose/extractors/content.py index 557840f4..e0703d55 100644 --- a/goose/extractors/content.py +++ b/goose/extractors/content.py @@ -46,12 +46,6 @@ def get_language(self): return self.article.meta_lang[:2] return self.config.target_language - def get_domain(self): - if self.article.final_url: - o = urlparse(self.article.final_url) - return o.hostname - return None - def get_known_article_tags(self): for item in KNOWN_ARTICLE_CONTENT_TAGS: nodes = self.parser.getElementsByTag( diff --git a/goose/extractors/metas.py b/goose/extractors/metas.py index efde6714..95acadd5 100644 --- a/goose/extractors/metas.py +++ b/goose/extractors/metas.py @@ -33,6 +33,12 @@ class MetasExtractor(BaseExtractor): + def get_domain(self): + if self.article.final_url: + o = urlparse(self.article.final_url) + return o.hostname + return None + def get_favicon(self): """\ Extract the favicon from a website @@ -124,5 +130,6 @@ def extract(self): "keywords": self.get_meta_keywords(), "lang": self.get_meta_lang(), "favicon": self.get_favicon(), - "canonical": self.get_canonical_link() + "canonical": self.get_canonical_link(), + "domain": self.get_domain() } From 49f50b00a67d6a2ae8b1896b1592b123b0b0aa01 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Wed, 31 Dec 2014 06:17:39 +0100 Subject: [PATCH 081/100] #188 - move test files --- .../{ => content}/test_allnewlyrics1.html | 0 .../{ => content}/test_allnewlyrics1.json | 0 .../{ => content}/test_aolNews.html | 0 .../{ => content}/test_aolNews.json | 0 .../test_articlebody_attribute.html | 0 .../test_articlebody_attribute.json | 0 .../test_articlebody_itemprop.html | 0 .../test_articlebody_itemprop.json | 0 .../{ => content}/test_articlebody_tag.html | 0 .../{ => content}/test_articlebody_tag.json | 0 .../{ => content}/test_author_schema.html | 0 .../{ => content}/test_author_schema.json | 0 .../{ => content}/test_bbc_chinese.html | 0 .../{ => content}/test_bbc_chinese.json | 0 .../{ => content}/test_businessWeek1.html | 0 .../{ => content}/test_businessWeek1.json | 0 .../{ => content}/test_businessWeek2.html | 0 .../{ => content}/test_businessWeek2.json | 0 .../{ => content}/test_businessWeek3.html | 0 .../{ => content}/test_businessWeek3.json | 0 .../{ => content}/test_businessinsider3.html | 0 .../{ => content}/test_businessinsider3.json | 0 .../{ => content}/test_cbslocal.html | 0 .../{ => content}/test_cbslocal.json | 0 .../extractors/{ => content}/test_cnbc1.html | 0 .../extractors/{ => content}/test_cnbc1.json | 0 .../extractors/{ => content}/test_cnet.html | 0 .../extractors/{ => content}/test_cnet.json | 0 .../extractors/{ => content}/test_cnn1.html | 0 .../extractors/{ => content}/test_cnn1.json | 0 .../{ => content}/test_cnn_arabic.html | 0 .../{ => content}/test_cnn_arabic.json | 0 .../{ => content}/test_donga_korean.html | 0 .../{ => content}/test_donga_korean.json 
| 0 .../{ => content}/test_elmondo1.html | 0 .../{ => content}/test_elmondo1.json | 0 .../extractors/{ => content}/test_elpais.html | 0 .../extractors/{ => content}/test_elpais.json | 0 .../{ => content}/test_engadget.html | 0 .../{ => content}/test_engadget.json | 0 .../extractors/{ => content}/test_espn.html | 0 .../extractors/{ => content}/test_espn.json | 0 .../{ => content}/test_foxNews.html | 0 .../{ => content}/test_foxNews.json | 0 .../{ => content}/test_get_canonical_url.html | 0 .../{ => content}/test_get_canonical_url.json | 0 .../{ => content}/test_gizmodo1.html | 0 .../{ => content}/test_gizmodo1.json | 0 .../{ => content}/test_guardian1.html | 0 .../{ => content}/test_guardian1.json | 0 .../{ => content}/test_huffingtonPost2.html | 0 .../{ => content}/test_huffingtonPost2.json | 0 .../{ => content}/test_issue115.html | 0 .../{ => content}/test_issue115.json | 0 .../{ => content}/test_issue129.html | 0 .../{ => content}/test_issue129.json | 0 .../{ => content}/test_issue24.html | 0 .../{ => content}/test_issue24.json | 0 .../{ => content}/test_issue25.html | 0 .../{ => content}/test_issue25.json | 0 .../{ => content}/test_issue28.html | 0 .../{ => content}/test_issue28.json | 0 .../{ => content}/test_issue32.html | 0 .../{ => content}/test_issue32.json | 0 .../extractors/{ => content}/test_issue4.html | 0 .../extractors/{ => content}/test_issue4.json | 0 .../{ => content}/test_lefigaro.html | 0 .../{ => content}/test_lefigaro.json | 0 .../{ => content}/test_liberation.html | 0 .../{ => content}/test_liberation.json | 0 .../extractors/{ => content}/test_links.html | 0 .../extractors/{ => content}/test_links.json | 0 .../{ => content}/test_marketplace.html | 0 .../{ => content}/test_marketplace.json | 0 .../{ => content}/test_mashable_issue_74.html | 0 .../{ => content}/test_mashable_issue_74.json | 0 .../extractors/{ => content}/test_msn1.html | 0 .../extractors/{ => content}/test_msn1.json | 0 .../{ => content}/test_okaymarketing.html | 0 .../{ => content}/test_okaymarketing.json | 0 .../{ => content}/test_opengraph.html | 0 .../{ => content}/test_opengraph.json | 0 .../{ => content}/test_politico.html | 0 .../{ => content}/test_politico.json | 0 .../{ => content}/test_publish_date.html | 0 .../{ => content}/test_publish_date.json | 0 .../test_publish_date_article.html | 0 .../test_publish_date_article.json | 0 .../test_publish_date_rnews.html | 0 .../test_publish_date_rnews.json | 0 .../test_publish_date_schema.html | 0 .../test_publish_date_schema.json | 0 .../{ => content}/test_tags_abcau.html | 0 .../{ => content}/test_tags_abcau.json | 0 .../{ => content}/test_tags_cnet.html | 0 .../{ => content}/test_tags_cnet.json | 0 .../{ => content}/test_tags_deadline.html | 0 .../{ => content}/test_tags_deadline.json | 0 .../{ => content}/test_tags_kexp.html | 0 .../{ => content}/test_tags_kexp.json | 0 .../{ => content}/test_tags_wnyc.html | 0 .../{ => content}/test_tags_wnyc.json | 0 .../{ => content}/test_techcrunch1.html | 0 .../{ => content}/test_techcrunch1.json | 0 .../test_testHuffingtonPost.html | 0 .../test_testHuffingtonPost.json | 0 .../extractors/{ => content}/test_time.html | 0 .../extractors/{ => content}/test_time.json | 0 .../extractors/{ => content}/test_time2.html | 0 .../extractors/{ => content}/test_time2.json | 0 .../{ => content}/test_title_opengraph.html | 0 .../{ => content}/test_title_opengraph.json | 0 .../extractors/{ => content}/test_tweet.html | 0 .../extractors/{ => content}/test_tweet.json | 0 .../{ => content}/test_usatoday_issue_74.html | 0 .../{ => 
content}/test_usatoday_issue_74.json | 0 .../extractors/{ => content}/test_yahoo.html | 0 .../extractors/{ => content}/test_yahoo.json | 0 .../{ => extractors}/videos/test_embed.html | 0 .../{ => extractors}/videos/test_embed.json | 0 .../{ => extractors}/videos/test_iframe.html | 0 .../{ => extractors}/videos/test_iframe.json | 0 .../{ => extractors}/videos/test_object.html | 0 .../{ => extractors}/videos/test_object.json | 0 tests/extractors/__init__.py | 0 tests/extractors/base.py | 252 ++++++++++++++++++ .../{extractors.py => extractors/content.py} | 0 tests/{ => extractors}/images.py | 0 tests/{ => extractors}/videos.py | 0 129 files changed, 252 insertions(+) rename tests/data/extractors/{ => content}/test_allnewlyrics1.html (100%) rename tests/data/extractors/{ => content}/test_allnewlyrics1.json (100%) rename tests/data/extractors/{ => content}/test_aolNews.html (100%) rename tests/data/extractors/{ => content}/test_aolNews.json (100%) rename tests/data/extractors/{ => content}/test_articlebody_attribute.html (100%) rename tests/data/extractors/{ => content}/test_articlebody_attribute.json (100%) rename tests/data/extractors/{ => content}/test_articlebody_itemprop.html (100%) rename tests/data/extractors/{ => content}/test_articlebody_itemprop.json (100%) rename tests/data/extractors/{ => content}/test_articlebody_tag.html (100%) rename tests/data/extractors/{ => content}/test_articlebody_tag.json (100%) rename tests/data/extractors/{ => content}/test_author_schema.html (100%) rename tests/data/extractors/{ => content}/test_author_schema.json (100%) rename tests/data/extractors/{ => content}/test_bbc_chinese.html (100%) rename tests/data/extractors/{ => content}/test_bbc_chinese.json (100%) rename tests/data/extractors/{ => content}/test_businessWeek1.html (100%) rename tests/data/extractors/{ => content}/test_businessWeek1.json (100%) rename tests/data/extractors/{ => content}/test_businessWeek2.html (100%) rename tests/data/extractors/{ => content}/test_businessWeek2.json (100%) rename tests/data/extractors/{ => content}/test_businessWeek3.html (100%) rename tests/data/extractors/{ => content}/test_businessWeek3.json (100%) rename tests/data/extractors/{ => content}/test_businessinsider3.html (100%) rename tests/data/extractors/{ => content}/test_businessinsider3.json (100%) rename tests/data/extractors/{ => content}/test_cbslocal.html (100%) rename tests/data/extractors/{ => content}/test_cbslocal.json (100%) rename tests/data/extractors/{ => content}/test_cnbc1.html (100%) rename tests/data/extractors/{ => content}/test_cnbc1.json (100%) rename tests/data/extractors/{ => content}/test_cnet.html (100%) rename tests/data/extractors/{ => content}/test_cnet.json (100%) rename tests/data/extractors/{ => content}/test_cnn1.html (100%) rename tests/data/extractors/{ => content}/test_cnn1.json (100%) rename tests/data/extractors/{ => content}/test_cnn_arabic.html (100%) rename tests/data/extractors/{ => content}/test_cnn_arabic.json (100%) rename tests/data/extractors/{ => content}/test_donga_korean.html (100%) rename tests/data/extractors/{ => content}/test_donga_korean.json (100%) rename tests/data/extractors/{ => content}/test_elmondo1.html (100%) rename tests/data/extractors/{ => content}/test_elmondo1.json (100%) rename tests/data/extractors/{ => content}/test_elpais.html (100%) rename tests/data/extractors/{ => content}/test_elpais.json (100%) rename tests/data/extractors/{ => content}/test_engadget.html (100%) rename tests/data/extractors/{ => content}/test_engadget.json (100%) 
rename tests/data/extractors/{ => content}/test_espn.html (100%) rename tests/data/extractors/{ => content}/test_espn.json (100%) rename tests/data/extractors/{ => content}/test_foxNews.html (100%) rename tests/data/extractors/{ => content}/test_foxNews.json (100%) rename tests/data/extractors/{ => content}/test_get_canonical_url.html (100%) rename tests/data/extractors/{ => content}/test_get_canonical_url.json (100%) rename tests/data/extractors/{ => content}/test_gizmodo1.html (100%) rename tests/data/extractors/{ => content}/test_gizmodo1.json (100%) rename tests/data/extractors/{ => content}/test_guardian1.html (100%) rename tests/data/extractors/{ => content}/test_guardian1.json (100%) rename tests/data/extractors/{ => content}/test_huffingtonPost2.html (100%) rename tests/data/extractors/{ => content}/test_huffingtonPost2.json (100%) rename tests/data/extractors/{ => content}/test_issue115.html (100%) rename tests/data/extractors/{ => content}/test_issue115.json (100%) rename tests/data/extractors/{ => content}/test_issue129.html (100%) rename tests/data/extractors/{ => content}/test_issue129.json (100%) rename tests/data/extractors/{ => content}/test_issue24.html (100%) rename tests/data/extractors/{ => content}/test_issue24.json (100%) rename tests/data/extractors/{ => content}/test_issue25.html (100%) rename tests/data/extractors/{ => content}/test_issue25.json (100%) rename tests/data/extractors/{ => content}/test_issue28.html (100%) rename tests/data/extractors/{ => content}/test_issue28.json (100%) rename tests/data/extractors/{ => content}/test_issue32.html (100%) rename tests/data/extractors/{ => content}/test_issue32.json (100%) rename tests/data/extractors/{ => content}/test_issue4.html (100%) rename tests/data/extractors/{ => content}/test_issue4.json (100%) rename tests/data/extractors/{ => content}/test_lefigaro.html (100%) rename tests/data/extractors/{ => content}/test_lefigaro.json (100%) rename tests/data/extractors/{ => content}/test_liberation.html (100%) rename tests/data/extractors/{ => content}/test_liberation.json (100%) rename tests/data/extractors/{ => content}/test_links.html (100%) rename tests/data/extractors/{ => content}/test_links.json (100%) rename tests/data/extractors/{ => content}/test_marketplace.html (100%) rename tests/data/extractors/{ => content}/test_marketplace.json (100%) rename tests/data/extractors/{ => content}/test_mashable_issue_74.html (100%) rename tests/data/extractors/{ => content}/test_mashable_issue_74.json (100%) rename tests/data/extractors/{ => content}/test_msn1.html (100%) rename tests/data/extractors/{ => content}/test_msn1.json (100%) rename tests/data/extractors/{ => content}/test_okaymarketing.html (100%) rename tests/data/extractors/{ => content}/test_okaymarketing.json (100%) rename tests/data/extractors/{ => content}/test_opengraph.html (100%) rename tests/data/extractors/{ => content}/test_opengraph.json (100%) rename tests/data/extractors/{ => content}/test_politico.html (100%) rename tests/data/extractors/{ => content}/test_politico.json (100%) rename tests/data/extractors/{ => content}/test_publish_date.html (100%) rename tests/data/extractors/{ => content}/test_publish_date.json (100%) rename tests/data/extractors/{ => content}/test_publish_date_article.html (100%) rename tests/data/extractors/{ => content}/test_publish_date_article.json (100%) rename tests/data/extractors/{ => content}/test_publish_date_rnews.html (100%) rename tests/data/extractors/{ => content}/test_publish_date_rnews.json (100%) rename 
tests/data/extractors/{ => content}/test_publish_date_schema.html (100%) rename tests/data/extractors/{ => content}/test_publish_date_schema.json (100%) rename tests/data/extractors/{ => content}/test_tags_abcau.html (100%) rename tests/data/extractors/{ => content}/test_tags_abcau.json (100%) rename tests/data/extractors/{ => content}/test_tags_cnet.html (100%) rename tests/data/extractors/{ => content}/test_tags_cnet.json (100%) rename tests/data/extractors/{ => content}/test_tags_deadline.html (100%) rename tests/data/extractors/{ => content}/test_tags_deadline.json (100%) rename tests/data/extractors/{ => content}/test_tags_kexp.html (100%) rename tests/data/extractors/{ => content}/test_tags_kexp.json (100%) rename tests/data/extractors/{ => content}/test_tags_wnyc.html (100%) rename tests/data/extractors/{ => content}/test_tags_wnyc.json (100%) rename tests/data/extractors/{ => content}/test_techcrunch1.html (100%) rename tests/data/extractors/{ => content}/test_techcrunch1.json (100%) rename tests/data/extractors/{ => content}/test_testHuffingtonPost.html (100%) rename tests/data/extractors/{ => content}/test_testHuffingtonPost.json (100%) rename tests/data/extractors/{ => content}/test_time.html (100%) rename tests/data/extractors/{ => content}/test_time.json (100%) rename tests/data/extractors/{ => content}/test_time2.html (100%) rename tests/data/extractors/{ => content}/test_time2.json (100%) rename tests/data/extractors/{ => content}/test_title_opengraph.html (100%) rename tests/data/extractors/{ => content}/test_title_opengraph.json (100%) rename tests/data/extractors/{ => content}/test_tweet.html (100%) rename tests/data/extractors/{ => content}/test_tweet.json (100%) rename tests/data/extractors/{ => content}/test_usatoday_issue_74.html (100%) rename tests/data/extractors/{ => content}/test_usatoday_issue_74.json (100%) rename tests/data/extractors/{ => content}/test_yahoo.html (100%) rename tests/data/extractors/{ => content}/test_yahoo.json (100%) rename tests/data/{ => extractors}/videos/test_embed.html (100%) rename tests/data/{ => extractors}/videos/test_embed.json (100%) rename tests/data/{ => extractors}/videos/test_iframe.html (100%) rename tests/data/{ => extractors}/videos/test_iframe.json (100%) rename tests/data/{ => extractors}/videos/test_object.html (100%) rename tests/data/{ => extractors}/videos/test_object.json (100%) create mode 100644 tests/extractors/__init__.py create mode 100644 tests/extractors/base.py rename tests/{extractors.py => extractors/content.py} (100%) rename tests/{ => extractors}/images.py (100%) rename tests/{ => extractors}/videos.py (100%) diff --git a/tests/data/extractors/test_allnewlyrics1.html b/tests/data/extractors/content/test_allnewlyrics1.html similarity index 100% rename from tests/data/extractors/test_allnewlyrics1.html rename to tests/data/extractors/content/test_allnewlyrics1.html diff --git a/tests/data/extractors/test_allnewlyrics1.json b/tests/data/extractors/content/test_allnewlyrics1.json similarity index 100% rename from tests/data/extractors/test_allnewlyrics1.json rename to tests/data/extractors/content/test_allnewlyrics1.json diff --git a/tests/data/extractors/test_aolNews.html b/tests/data/extractors/content/test_aolNews.html similarity index 100% rename from tests/data/extractors/test_aolNews.html rename to tests/data/extractors/content/test_aolNews.html diff --git a/tests/data/extractors/test_aolNews.json b/tests/data/extractors/content/test_aolNews.json similarity index 100% rename from 
tests/data/extractors/test_aolNews.json rename to tests/data/extractors/content/test_aolNews.json diff --git a/tests/data/extractors/test_articlebody_attribute.html b/tests/data/extractors/content/test_articlebody_attribute.html similarity index 100% rename from tests/data/extractors/test_articlebody_attribute.html rename to tests/data/extractors/content/test_articlebody_attribute.html diff --git a/tests/data/extractors/test_articlebody_attribute.json b/tests/data/extractors/content/test_articlebody_attribute.json similarity index 100% rename from tests/data/extractors/test_articlebody_attribute.json rename to tests/data/extractors/content/test_articlebody_attribute.json diff --git a/tests/data/extractors/test_articlebody_itemprop.html b/tests/data/extractors/content/test_articlebody_itemprop.html similarity index 100% rename from tests/data/extractors/test_articlebody_itemprop.html rename to tests/data/extractors/content/test_articlebody_itemprop.html diff --git a/tests/data/extractors/test_articlebody_itemprop.json b/tests/data/extractors/content/test_articlebody_itemprop.json similarity index 100% rename from tests/data/extractors/test_articlebody_itemprop.json rename to tests/data/extractors/content/test_articlebody_itemprop.json diff --git a/tests/data/extractors/test_articlebody_tag.html b/tests/data/extractors/content/test_articlebody_tag.html similarity index 100% rename from tests/data/extractors/test_articlebody_tag.html rename to tests/data/extractors/content/test_articlebody_tag.html diff --git a/tests/data/extractors/test_articlebody_tag.json b/tests/data/extractors/content/test_articlebody_tag.json similarity index 100% rename from tests/data/extractors/test_articlebody_tag.json rename to tests/data/extractors/content/test_articlebody_tag.json diff --git a/tests/data/extractors/test_author_schema.html b/tests/data/extractors/content/test_author_schema.html similarity index 100% rename from tests/data/extractors/test_author_schema.html rename to tests/data/extractors/content/test_author_schema.html diff --git a/tests/data/extractors/test_author_schema.json b/tests/data/extractors/content/test_author_schema.json similarity index 100% rename from tests/data/extractors/test_author_schema.json rename to tests/data/extractors/content/test_author_schema.json diff --git a/tests/data/extractors/test_bbc_chinese.html b/tests/data/extractors/content/test_bbc_chinese.html similarity index 100% rename from tests/data/extractors/test_bbc_chinese.html rename to tests/data/extractors/content/test_bbc_chinese.html diff --git a/tests/data/extractors/test_bbc_chinese.json b/tests/data/extractors/content/test_bbc_chinese.json similarity index 100% rename from tests/data/extractors/test_bbc_chinese.json rename to tests/data/extractors/content/test_bbc_chinese.json diff --git a/tests/data/extractors/test_businessWeek1.html b/tests/data/extractors/content/test_businessWeek1.html similarity index 100% rename from tests/data/extractors/test_businessWeek1.html rename to tests/data/extractors/content/test_businessWeek1.html diff --git a/tests/data/extractors/test_businessWeek1.json b/tests/data/extractors/content/test_businessWeek1.json similarity index 100% rename from tests/data/extractors/test_businessWeek1.json rename to tests/data/extractors/content/test_businessWeek1.json diff --git a/tests/data/extractors/test_businessWeek2.html b/tests/data/extractors/content/test_businessWeek2.html similarity index 100% rename from tests/data/extractors/test_businessWeek2.html rename to 
tests/data/extractors/content/test_businessWeek2.html diff --git a/tests/data/extractors/test_businessWeek2.json b/tests/data/extractors/content/test_businessWeek2.json similarity index 100% rename from tests/data/extractors/test_businessWeek2.json rename to tests/data/extractors/content/test_businessWeek2.json diff --git a/tests/data/extractors/test_businessWeek3.html b/tests/data/extractors/content/test_businessWeek3.html similarity index 100% rename from tests/data/extractors/test_businessWeek3.html rename to tests/data/extractors/content/test_businessWeek3.html diff --git a/tests/data/extractors/test_businessWeek3.json b/tests/data/extractors/content/test_businessWeek3.json similarity index 100% rename from tests/data/extractors/test_businessWeek3.json rename to tests/data/extractors/content/test_businessWeek3.json diff --git a/tests/data/extractors/test_businessinsider3.html b/tests/data/extractors/content/test_businessinsider3.html similarity index 100% rename from tests/data/extractors/test_businessinsider3.html rename to tests/data/extractors/content/test_businessinsider3.html diff --git a/tests/data/extractors/test_businessinsider3.json b/tests/data/extractors/content/test_businessinsider3.json similarity index 100% rename from tests/data/extractors/test_businessinsider3.json rename to tests/data/extractors/content/test_businessinsider3.json diff --git a/tests/data/extractors/test_cbslocal.html b/tests/data/extractors/content/test_cbslocal.html similarity index 100% rename from tests/data/extractors/test_cbslocal.html rename to tests/data/extractors/content/test_cbslocal.html diff --git a/tests/data/extractors/test_cbslocal.json b/tests/data/extractors/content/test_cbslocal.json similarity index 100% rename from tests/data/extractors/test_cbslocal.json rename to tests/data/extractors/content/test_cbslocal.json diff --git a/tests/data/extractors/test_cnbc1.html b/tests/data/extractors/content/test_cnbc1.html similarity index 100% rename from tests/data/extractors/test_cnbc1.html rename to tests/data/extractors/content/test_cnbc1.html diff --git a/tests/data/extractors/test_cnbc1.json b/tests/data/extractors/content/test_cnbc1.json similarity index 100% rename from tests/data/extractors/test_cnbc1.json rename to tests/data/extractors/content/test_cnbc1.json diff --git a/tests/data/extractors/test_cnet.html b/tests/data/extractors/content/test_cnet.html similarity index 100% rename from tests/data/extractors/test_cnet.html rename to tests/data/extractors/content/test_cnet.html diff --git a/tests/data/extractors/test_cnet.json b/tests/data/extractors/content/test_cnet.json similarity index 100% rename from tests/data/extractors/test_cnet.json rename to tests/data/extractors/content/test_cnet.json diff --git a/tests/data/extractors/test_cnn1.html b/tests/data/extractors/content/test_cnn1.html similarity index 100% rename from tests/data/extractors/test_cnn1.html rename to tests/data/extractors/content/test_cnn1.html diff --git a/tests/data/extractors/test_cnn1.json b/tests/data/extractors/content/test_cnn1.json similarity index 100% rename from tests/data/extractors/test_cnn1.json rename to tests/data/extractors/content/test_cnn1.json diff --git a/tests/data/extractors/test_cnn_arabic.html b/tests/data/extractors/content/test_cnn_arabic.html similarity index 100% rename from tests/data/extractors/test_cnn_arabic.html rename to tests/data/extractors/content/test_cnn_arabic.html diff --git a/tests/data/extractors/test_cnn_arabic.json b/tests/data/extractors/content/test_cnn_arabic.json 
similarity index 100% rename from tests/data/extractors/test_cnn_arabic.json rename to tests/data/extractors/content/test_cnn_arabic.json diff --git a/tests/data/extractors/test_donga_korean.html b/tests/data/extractors/content/test_donga_korean.html similarity index 100% rename from tests/data/extractors/test_donga_korean.html rename to tests/data/extractors/content/test_donga_korean.html diff --git a/tests/data/extractors/test_donga_korean.json b/tests/data/extractors/content/test_donga_korean.json similarity index 100% rename from tests/data/extractors/test_donga_korean.json rename to tests/data/extractors/content/test_donga_korean.json diff --git a/tests/data/extractors/test_elmondo1.html b/tests/data/extractors/content/test_elmondo1.html similarity index 100% rename from tests/data/extractors/test_elmondo1.html rename to tests/data/extractors/content/test_elmondo1.html diff --git a/tests/data/extractors/test_elmondo1.json b/tests/data/extractors/content/test_elmondo1.json similarity index 100% rename from tests/data/extractors/test_elmondo1.json rename to tests/data/extractors/content/test_elmondo1.json diff --git a/tests/data/extractors/test_elpais.html b/tests/data/extractors/content/test_elpais.html similarity index 100% rename from tests/data/extractors/test_elpais.html rename to tests/data/extractors/content/test_elpais.html diff --git a/tests/data/extractors/test_elpais.json b/tests/data/extractors/content/test_elpais.json similarity index 100% rename from tests/data/extractors/test_elpais.json rename to tests/data/extractors/content/test_elpais.json diff --git a/tests/data/extractors/test_engadget.html b/tests/data/extractors/content/test_engadget.html similarity index 100% rename from tests/data/extractors/test_engadget.html rename to tests/data/extractors/content/test_engadget.html diff --git a/tests/data/extractors/test_engadget.json b/tests/data/extractors/content/test_engadget.json similarity index 100% rename from tests/data/extractors/test_engadget.json rename to tests/data/extractors/content/test_engadget.json diff --git a/tests/data/extractors/test_espn.html b/tests/data/extractors/content/test_espn.html similarity index 100% rename from tests/data/extractors/test_espn.html rename to tests/data/extractors/content/test_espn.html diff --git a/tests/data/extractors/test_espn.json b/tests/data/extractors/content/test_espn.json similarity index 100% rename from tests/data/extractors/test_espn.json rename to tests/data/extractors/content/test_espn.json diff --git a/tests/data/extractors/test_foxNews.html b/tests/data/extractors/content/test_foxNews.html similarity index 100% rename from tests/data/extractors/test_foxNews.html rename to tests/data/extractors/content/test_foxNews.html diff --git a/tests/data/extractors/test_foxNews.json b/tests/data/extractors/content/test_foxNews.json similarity index 100% rename from tests/data/extractors/test_foxNews.json rename to tests/data/extractors/content/test_foxNews.json diff --git a/tests/data/extractors/test_get_canonical_url.html b/tests/data/extractors/content/test_get_canonical_url.html similarity index 100% rename from tests/data/extractors/test_get_canonical_url.html rename to tests/data/extractors/content/test_get_canonical_url.html diff --git a/tests/data/extractors/test_get_canonical_url.json b/tests/data/extractors/content/test_get_canonical_url.json similarity index 100% rename from tests/data/extractors/test_get_canonical_url.json rename to tests/data/extractors/content/test_get_canonical_url.json diff --git 
a/tests/data/extractors/test_gizmodo1.html b/tests/data/extractors/content/test_gizmodo1.html similarity index 100% rename from tests/data/extractors/test_gizmodo1.html rename to tests/data/extractors/content/test_gizmodo1.html diff --git a/tests/data/extractors/test_gizmodo1.json b/tests/data/extractors/content/test_gizmodo1.json similarity index 100% rename from tests/data/extractors/test_gizmodo1.json rename to tests/data/extractors/content/test_gizmodo1.json diff --git a/tests/data/extractors/test_guardian1.html b/tests/data/extractors/content/test_guardian1.html similarity index 100% rename from tests/data/extractors/test_guardian1.html rename to tests/data/extractors/content/test_guardian1.html diff --git a/tests/data/extractors/test_guardian1.json b/tests/data/extractors/content/test_guardian1.json similarity index 100% rename from tests/data/extractors/test_guardian1.json rename to tests/data/extractors/content/test_guardian1.json diff --git a/tests/data/extractors/test_huffingtonPost2.html b/tests/data/extractors/content/test_huffingtonPost2.html similarity index 100% rename from tests/data/extractors/test_huffingtonPost2.html rename to tests/data/extractors/content/test_huffingtonPost2.html diff --git a/tests/data/extractors/test_huffingtonPost2.json b/tests/data/extractors/content/test_huffingtonPost2.json similarity index 100% rename from tests/data/extractors/test_huffingtonPost2.json rename to tests/data/extractors/content/test_huffingtonPost2.json diff --git a/tests/data/extractors/test_issue115.html b/tests/data/extractors/content/test_issue115.html similarity index 100% rename from tests/data/extractors/test_issue115.html rename to tests/data/extractors/content/test_issue115.html diff --git a/tests/data/extractors/test_issue115.json b/tests/data/extractors/content/test_issue115.json similarity index 100% rename from tests/data/extractors/test_issue115.json rename to tests/data/extractors/content/test_issue115.json diff --git a/tests/data/extractors/test_issue129.html b/tests/data/extractors/content/test_issue129.html similarity index 100% rename from tests/data/extractors/test_issue129.html rename to tests/data/extractors/content/test_issue129.html diff --git a/tests/data/extractors/test_issue129.json b/tests/data/extractors/content/test_issue129.json similarity index 100% rename from tests/data/extractors/test_issue129.json rename to tests/data/extractors/content/test_issue129.json diff --git a/tests/data/extractors/test_issue24.html b/tests/data/extractors/content/test_issue24.html similarity index 100% rename from tests/data/extractors/test_issue24.html rename to tests/data/extractors/content/test_issue24.html diff --git a/tests/data/extractors/test_issue24.json b/tests/data/extractors/content/test_issue24.json similarity index 100% rename from tests/data/extractors/test_issue24.json rename to tests/data/extractors/content/test_issue24.json diff --git a/tests/data/extractors/test_issue25.html b/tests/data/extractors/content/test_issue25.html similarity index 100% rename from tests/data/extractors/test_issue25.html rename to tests/data/extractors/content/test_issue25.html diff --git a/tests/data/extractors/test_issue25.json b/tests/data/extractors/content/test_issue25.json similarity index 100% rename from tests/data/extractors/test_issue25.json rename to tests/data/extractors/content/test_issue25.json diff --git a/tests/data/extractors/test_issue28.html b/tests/data/extractors/content/test_issue28.html similarity index 100% rename from 
tests/data/extractors/test_issue28.html rename to tests/data/extractors/content/test_issue28.html diff --git a/tests/data/extractors/test_issue28.json b/tests/data/extractors/content/test_issue28.json similarity index 100% rename from tests/data/extractors/test_issue28.json rename to tests/data/extractors/content/test_issue28.json diff --git a/tests/data/extractors/test_issue32.html b/tests/data/extractors/content/test_issue32.html similarity index 100% rename from tests/data/extractors/test_issue32.html rename to tests/data/extractors/content/test_issue32.html diff --git a/tests/data/extractors/test_issue32.json b/tests/data/extractors/content/test_issue32.json similarity index 100% rename from tests/data/extractors/test_issue32.json rename to tests/data/extractors/content/test_issue32.json diff --git a/tests/data/extractors/test_issue4.html b/tests/data/extractors/content/test_issue4.html similarity index 100% rename from tests/data/extractors/test_issue4.html rename to tests/data/extractors/content/test_issue4.html diff --git a/tests/data/extractors/test_issue4.json b/tests/data/extractors/content/test_issue4.json similarity index 100% rename from tests/data/extractors/test_issue4.json rename to tests/data/extractors/content/test_issue4.json diff --git a/tests/data/extractors/test_lefigaro.html b/tests/data/extractors/content/test_lefigaro.html similarity index 100% rename from tests/data/extractors/test_lefigaro.html rename to tests/data/extractors/content/test_lefigaro.html diff --git a/tests/data/extractors/test_lefigaro.json b/tests/data/extractors/content/test_lefigaro.json similarity index 100% rename from tests/data/extractors/test_lefigaro.json rename to tests/data/extractors/content/test_lefigaro.json diff --git a/tests/data/extractors/test_liberation.html b/tests/data/extractors/content/test_liberation.html similarity index 100% rename from tests/data/extractors/test_liberation.html rename to tests/data/extractors/content/test_liberation.html diff --git a/tests/data/extractors/test_liberation.json b/tests/data/extractors/content/test_liberation.json similarity index 100% rename from tests/data/extractors/test_liberation.json rename to tests/data/extractors/content/test_liberation.json diff --git a/tests/data/extractors/test_links.html b/tests/data/extractors/content/test_links.html similarity index 100% rename from tests/data/extractors/test_links.html rename to tests/data/extractors/content/test_links.html diff --git a/tests/data/extractors/test_links.json b/tests/data/extractors/content/test_links.json similarity index 100% rename from tests/data/extractors/test_links.json rename to tests/data/extractors/content/test_links.json diff --git a/tests/data/extractors/test_marketplace.html b/tests/data/extractors/content/test_marketplace.html similarity index 100% rename from tests/data/extractors/test_marketplace.html rename to tests/data/extractors/content/test_marketplace.html diff --git a/tests/data/extractors/test_marketplace.json b/tests/data/extractors/content/test_marketplace.json similarity index 100% rename from tests/data/extractors/test_marketplace.json rename to tests/data/extractors/content/test_marketplace.json diff --git a/tests/data/extractors/test_mashable_issue_74.html b/tests/data/extractors/content/test_mashable_issue_74.html similarity index 100% rename from tests/data/extractors/test_mashable_issue_74.html rename to tests/data/extractors/content/test_mashable_issue_74.html diff --git a/tests/data/extractors/test_mashable_issue_74.json 
b/tests/data/extractors/content/test_mashable_issue_74.json similarity index 100% rename from tests/data/extractors/test_mashable_issue_74.json rename to tests/data/extractors/content/test_mashable_issue_74.json diff --git a/tests/data/extractors/test_msn1.html b/tests/data/extractors/content/test_msn1.html similarity index 100% rename from tests/data/extractors/test_msn1.html rename to tests/data/extractors/content/test_msn1.html diff --git a/tests/data/extractors/test_msn1.json b/tests/data/extractors/content/test_msn1.json similarity index 100% rename from tests/data/extractors/test_msn1.json rename to tests/data/extractors/content/test_msn1.json diff --git a/tests/data/extractors/test_okaymarketing.html b/tests/data/extractors/content/test_okaymarketing.html similarity index 100% rename from tests/data/extractors/test_okaymarketing.html rename to tests/data/extractors/content/test_okaymarketing.html diff --git a/tests/data/extractors/test_okaymarketing.json b/tests/data/extractors/content/test_okaymarketing.json similarity index 100% rename from tests/data/extractors/test_okaymarketing.json rename to tests/data/extractors/content/test_okaymarketing.json diff --git a/tests/data/extractors/test_opengraph.html b/tests/data/extractors/content/test_opengraph.html similarity index 100% rename from tests/data/extractors/test_opengraph.html rename to tests/data/extractors/content/test_opengraph.html diff --git a/tests/data/extractors/test_opengraph.json b/tests/data/extractors/content/test_opengraph.json similarity index 100% rename from tests/data/extractors/test_opengraph.json rename to tests/data/extractors/content/test_opengraph.json diff --git a/tests/data/extractors/test_politico.html b/tests/data/extractors/content/test_politico.html similarity index 100% rename from tests/data/extractors/test_politico.html rename to tests/data/extractors/content/test_politico.html diff --git a/tests/data/extractors/test_politico.json b/tests/data/extractors/content/test_politico.json similarity index 100% rename from tests/data/extractors/test_politico.json rename to tests/data/extractors/content/test_politico.json diff --git a/tests/data/extractors/test_publish_date.html b/tests/data/extractors/content/test_publish_date.html similarity index 100% rename from tests/data/extractors/test_publish_date.html rename to tests/data/extractors/content/test_publish_date.html diff --git a/tests/data/extractors/test_publish_date.json b/tests/data/extractors/content/test_publish_date.json similarity index 100% rename from tests/data/extractors/test_publish_date.json rename to tests/data/extractors/content/test_publish_date.json diff --git a/tests/data/extractors/test_publish_date_article.html b/tests/data/extractors/content/test_publish_date_article.html similarity index 100% rename from tests/data/extractors/test_publish_date_article.html rename to tests/data/extractors/content/test_publish_date_article.html diff --git a/tests/data/extractors/test_publish_date_article.json b/tests/data/extractors/content/test_publish_date_article.json similarity index 100% rename from tests/data/extractors/test_publish_date_article.json rename to tests/data/extractors/content/test_publish_date_article.json diff --git a/tests/data/extractors/test_publish_date_rnews.html b/tests/data/extractors/content/test_publish_date_rnews.html similarity index 100% rename from tests/data/extractors/test_publish_date_rnews.html rename to tests/data/extractors/content/test_publish_date_rnews.html diff --git 
a/tests/data/extractors/test_publish_date_rnews.json b/tests/data/extractors/content/test_publish_date_rnews.json similarity index 100% rename from tests/data/extractors/test_publish_date_rnews.json rename to tests/data/extractors/content/test_publish_date_rnews.json diff --git a/tests/data/extractors/test_publish_date_schema.html b/tests/data/extractors/content/test_publish_date_schema.html similarity index 100% rename from tests/data/extractors/test_publish_date_schema.html rename to tests/data/extractors/content/test_publish_date_schema.html diff --git a/tests/data/extractors/test_publish_date_schema.json b/tests/data/extractors/content/test_publish_date_schema.json similarity index 100% rename from tests/data/extractors/test_publish_date_schema.json rename to tests/data/extractors/content/test_publish_date_schema.json diff --git a/tests/data/extractors/test_tags_abcau.html b/tests/data/extractors/content/test_tags_abcau.html similarity index 100% rename from tests/data/extractors/test_tags_abcau.html rename to tests/data/extractors/content/test_tags_abcau.html diff --git a/tests/data/extractors/test_tags_abcau.json b/tests/data/extractors/content/test_tags_abcau.json similarity index 100% rename from tests/data/extractors/test_tags_abcau.json rename to tests/data/extractors/content/test_tags_abcau.json diff --git a/tests/data/extractors/test_tags_cnet.html b/tests/data/extractors/content/test_tags_cnet.html similarity index 100% rename from tests/data/extractors/test_tags_cnet.html rename to tests/data/extractors/content/test_tags_cnet.html diff --git a/tests/data/extractors/test_tags_cnet.json b/tests/data/extractors/content/test_tags_cnet.json similarity index 100% rename from tests/data/extractors/test_tags_cnet.json rename to tests/data/extractors/content/test_tags_cnet.json diff --git a/tests/data/extractors/test_tags_deadline.html b/tests/data/extractors/content/test_tags_deadline.html similarity index 100% rename from tests/data/extractors/test_tags_deadline.html rename to tests/data/extractors/content/test_tags_deadline.html diff --git a/tests/data/extractors/test_tags_deadline.json b/tests/data/extractors/content/test_tags_deadline.json similarity index 100% rename from tests/data/extractors/test_tags_deadline.json rename to tests/data/extractors/content/test_tags_deadline.json diff --git a/tests/data/extractors/test_tags_kexp.html b/tests/data/extractors/content/test_tags_kexp.html similarity index 100% rename from tests/data/extractors/test_tags_kexp.html rename to tests/data/extractors/content/test_tags_kexp.html diff --git a/tests/data/extractors/test_tags_kexp.json b/tests/data/extractors/content/test_tags_kexp.json similarity index 100% rename from tests/data/extractors/test_tags_kexp.json rename to tests/data/extractors/content/test_tags_kexp.json diff --git a/tests/data/extractors/test_tags_wnyc.html b/tests/data/extractors/content/test_tags_wnyc.html similarity index 100% rename from tests/data/extractors/test_tags_wnyc.html rename to tests/data/extractors/content/test_tags_wnyc.html diff --git a/tests/data/extractors/test_tags_wnyc.json b/tests/data/extractors/content/test_tags_wnyc.json similarity index 100% rename from tests/data/extractors/test_tags_wnyc.json rename to tests/data/extractors/content/test_tags_wnyc.json diff --git a/tests/data/extractors/test_techcrunch1.html b/tests/data/extractors/content/test_techcrunch1.html similarity index 100% rename from tests/data/extractors/test_techcrunch1.html rename to tests/data/extractors/content/test_techcrunch1.html 
diff --git a/tests/data/extractors/test_techcrunch1.json b/tests/data/extractors/content/test_techcrunch1.json similarity index 100% rename from tests/data/extractors/test_techcrunch1.json rename to tests/data/extractors/content/test_techcrunch1.json diff --git a/tests/data/extractors/test_testHuffingtonPost.html b/tests/data/extractors/content/test_testHuffingtonPost.html similarity index 100% rename from tests/data/extractors/test_testHuffingtonPost.html rename to tests/data/extractors/content/test_testHuffingtonPost.html diff --git a/tests/data/extractors/test_testHuffingtonPost.json b/tests/data/extractors/content/test_testHuffingtonPost.json similarity index 100% rename from tests/data/extractors/test_testHuffingtonPost.json rename to tests/data/extractors/content/test_testHuffingtonPost.json diff --git a/tests/data/extractors/test_time.html b/tests/data/extractors/content/test_time.html similarity index 100% rename from tests/data/extractors/test_time.html rename to tests/data/extractors/content/test_time.html diff --git a/tests/data/extractors/test_time.json b/tests/data/extractors/content/test_time.json similarity index 100% rename from tests/data/extractors/test_time.json rename to tests/data/extractors/content/test_time.json diff --git a/tests/data/extractors/test_time2.html b/tests/data/extractors/content/test_time2.html similarity index 100% rename from tests/data/extractors/test_time2.html rename to tests/data/extractors/content/test_time2.html diff --git a/tests/data/extractors/test_time2.json b/tests/data/extractors/content/test_time2.json similarity index 100% rename from tests/data/extractors/test_time2.json rename to tests/data/extractors/content/test_time2.json diff --git a/tests/data/extractors/test_title_opengraph.html b/tests/data/extractors/content/test_title_opengraph.html similarity index 100% rename from tests/data/extractors/test_title_opengraph.html rename to tests/data/extractors/content/test_title_opengraph.html diff --git a/tests/data/extractors/test_title_opengraph.json b/tests/data/extractors/content/test_title_opengraph.json similarity index 100% rename from tests/data/extractors/test_title_opengraph.json rename to tests/data/extractors/content/test_title_opengraph.json diff --git a/tests/data/extractors/test_tweet.html b/tests/data/extractors/content/test_tweet.html similarity index 100% rename from tests/data/extractors/test_tweet.html rename to tests/data/extractors/content/test_tweet.html diff --git a/tests/data/extractors/test_tweet.json b/tests/data/extractors/content/test_tweet.json similarity index 100% rename from tests/data/extractors/test_tweet.json rename to tests/data/extractors/content/test_tweet.json diff --git a/tests/data/extractors/test_usatoday_issue_74.html b/tests/data/extractors/content/test_usatoday_issue_74.html similarity index 100% rename from tests/data/extractors/test_usatoday_issue_74.html rename to tests/data/extractors/content/test_usatoday_issue_74.html diff --git a/tests/data/extractors/test_usatoday_issue_74.json b/tests/data/extractors/content/test_usatoday_issue_74.json similarity index 100% rename from tests/data/extractors/test_usatoday_issue_74.json rename to tests/data/extractors/content/test_usatoday_issue_74.json diff --git a/tests/data/extractors/test_yahoo.html b/tests/data/extractors/content/test_yahoo.html similarity index 100% rename from tests/data/extractors/test_yahoo.html rename to tests/data/extractors/content/test_yahoo.html diff --git a/tests/data/extractors/test_yahoo.json 
b/tests/data/extractors/content/test_yahoo.json similarity index 100% rename from tests/data/extractors/test_yahoo.json rename to tests/data/extractors/content/test_yahoo.json diff --git a/tests/data/videos/test_embed.html b/tests/data/extractors/videos/test_embed.html similarity index 100% rename from tests/data/videos/test_embed.html rename to tests/data/extractors/videos/test_embed.html diff --git a/tests/data/videos/test_embed.json b/tests/data/extractors/videos/test_embed.json similarity index 100% rename from tests/data/videos/test_embed.json rename to tests/data/extractors/videos/test_embed.json diff --git a/tests/data/videos/test_iframe.html b/tests/data/extractors/videos/test_iframe.html similarity index 100% rename from tests/data/videos/test_iframe.html rename to tests/data/extractors/videos/test_iframe.html diff --git a/tests/data/videos/test_iframe.json b/tests/data/extractors/videos/test_iframe.json similarity index 100% rename from tests/data/videos/test_iframe.json rename to tests/data/extractors/videos/test_iframe.json diff --git a/tests/data/videos/test_object.html b/tests/data/extractors/videos/test_object.html similarity index 100% rename from tests/data/videos/test_object.html rename to tests/data/extractors/videos/test_object.html diff --git a/tests/data/videos/test_object.json b/tests/data/extractors/videos/test_object.json similarity index 100% rename from tests/data/videos/test_object.json rename to tests/data/extractors/videos/test_object.json diff --git a/tests/extractors/__init__.py b/tests/extractors/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/extractors/base.py b/tests/extractors/base.py new file mode 100644 index 00000000..60990b77 --- /dev/null +++ b/tests/extractors/base.py @@ -0,0 +1,252 @@ +# -*- coding: utf-8 -*- +"""\ +This is a python port of "Goose" originally licensed to Gravity.com +under one or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. + +Python port was written by Xavier Grangier for Recrutae + +Gravity.com licenses this file +to you under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance +with the License. You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
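+
+The classes below make up a small offline test harness: MockHTTPHandler is
+installed as the default urllib2 opener so that Goose's page fetches are
+answered from local fixture files under tests/data instead of the network,
+and socket.getaddrinfo is stubbed out so the tests never perform real DNS
+lookups.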
+""" +import os +import json +import urllib2 +import unittest +import socket + +from StringIO import StringIO + +from goose import Goose +from goose.utils import FileHelper +from goose.configuration import Configuration + + +CURRENT_PATH = os.path.dirname(os.path.abspath(__file__)) + + +# Response +class MockResponse(): + """\ + Base mock response class + """ + code = 200 + msg = "OK" + + def __init__(self, cls): + self.cls = cls + + def content(self): + return "response" + + def response(self, req): + data = self.content(req) + url = req.get_full_url() + resp = urllib2.addinfourl(StringIO(data), data, url) + resp.code = self.code + resp.msg = self.msg + return resp + + +class MockHTTPHandler(urllib2.HTTPHandler, urllib2.HTTPSHandler): + """\ + Mocked HTTPHandler in order to query APIs locally + """ + cls = None + + def https_open(self, req): + return self.http_open(req) + + def http_open(self, req): + r = self.cls.callback(self.cls) + return r.response(req) + + @staticmethod + def patch(cls): + opener = urllib2.build_opener(MockHTTPHandler) + urllib2.install_opener(opener) + # dirty ! + for h in opener.handlers: + if isinstance(h, MockHTTPHandler): + h.cls = cls + return [h for h in opener.handlers if isinstance(h, MockHTTPHandler)][0] + + @staticmethod + def unpatch(): + # urllib2 + urllib2._opener = None + + +class BaseMockTests(unittest.TestCase): + """\ + Base Mock test case + """ + callback = MockResponse + + def setUp(self): + # patch DNS + self.original_getaddrinfo = socket.getaddrinfo + socket.getaddrinfo = self.new_getaddrinfo + MockHTTPHandler.patch(self) + + def tearDown(self): + MockHTTPHandler.unpatch() + # DNS + socket.getaddrinfo = self.original_getaddrinfo + + def new_getaddrinfo(self, *args): + return [(2, 1, 6, '', ('127.0.0.1', 0))] + + def _get_current_testname(self): + return self.id().split('.')[-1:][0] + + +class MockResponseExtractors(MockResponse): + def content(self, req): + current_test = self.cls._get_current_testname() + path = os.path.join( + os.path.dirname(CURRENT_PATH), + "data", + "extractors", + "content", + "%s.html" % current_test) + path = os.path.abspath(path) + content = FileHelper.loadResourceFile(path) + return content + + +class TestExtractionBase(BaseMockTests): + """\ + Extraction test case + """ + callback = MockResponseExtractors + + def getRawHtml(self): + test, suite, module, cls, func = self.id().split('.') + path = os.path.join( + os.path.dirname(CURRENT_PATH), + "data", + suite, + module, + "%s.html" % func) + path = os.path.abspath(path) + content = FileHelper.loadResourceFile(path) + return content + + def loadData(self): + """\ + + """ + test, suite, module, cls, func = self.id().split('.') + path = os.path.join( + os.path.dirname(CURRENT_PATH), + "data", + suite, + module, + "%s.json" % func) + path = os.path.abspath(path) + content = FileHelper.loadResourceFile(path) + self.data = json.loads(content) + + def assert_cleaned_text(self, field, expected_value, result_value): + """\ + + """ + # # TODO : handle verbose level in tests + # print "\n=======================::. ARTICLE REPORT %s .::======================\n" % self.id() + # print 'expected_value (%s) \n' % len(expected_value) + # print expected_value + # print "-------" + # print 'result_value (%s) \n' % len(result_value) + # print result_value + + # cleaned_text is Null + msg = u"Resulting article text was NULL!" + self.assertNotEqual(result_value, None, msg=msg) + + # cleaned_text length + msg = u"Article text was not as long as expected beginning!" 
+ self.assertTrue(len(expected_value) <= len(result_value), msg=msg) + + # clean_text value + result_value = result_value[0:len(expected_value)] + msg = u"The beginning of the article text was not as expected!" + self.assertEqual(expected_value, result_value, msg=msg) + + def assert_tags(self, field, expected_value, result_value): + """\ + + """ + # as we have a set in expected_value and a list in result_value + # make result_value a set + expected_value = set(expected_value) + + # check if both have the same number of items + msg = (u"expected tags set and result tags set" + u"don't have the same number of items") + self.assertEqual(len(result_value), len(expected_value), msg=msg) + + # check if each tag in result_value is in expected_value + for tag in result_value: + self.assertTrue(tag in expected_value) + + def runArticleAssertions(self, article, fields): + """\ + + """ + for field in fields: + expected_value = self.data['expected'][field] + result_value = getattr(article, field, None) + + # custom assertion for a given field + assertion = 'assert_%s' % field + if hasattr(self, assertion): + getattr(self, assertion)(field, expected_value, result_value) + continue + + # default assertion + msg = u"Error %s \nexpected: %s\nresult: %s" % (field, expected_value, result_value) + self.assertEqual(expected_value, result_value, msg=msg) + + def extract(self, instance): + article = instance.extract(url=self.data['url']) + return article + + def getConfig(self): + config = Configuration() + config.enable_image_fetching = False + return config + + def getArticle(self): + """\ + + """ + # load test case data + self.loadData() + + # basic configuration + # no image fetching + config = self.getConfig() + self.parser = config.get_parser() + + # target language + # needed for non english language most of the time + target_language = self.data.get('target_language') + if target_language: + config.target_language = target_language + config.use_meta_language = False + + # run goose + g = Goose(config=config) + return self.extract(g) diff --git a/tests/extractors.py b/tests/extractors/content.py similarity index 100% rename from tests/extractors.py rename to tests/extractors/content.py diff --git a/tests/images.py b/tests/extractors/images.py similarity index 100% rename from tests/images.py rename to tests/extractors/images.py diff --git a/tests/videos.py b/tests/extractors/videos.py similarity index 100% rename from tests/videos.py rename to tests/extractors/videos.py From 6009d44905334b855ba45a037944e81261d894b3 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Wed, 31 Dec 2014 06:24:10 +0100 Subject: [PATCH 082/100] #188 - tests refactor --- tests/base.py | 82 ------------------------------------- tests/extractors/content.py | 32 ++++++++------- tests/extractors/images.py | 2 +- tests/extractors/videos.py | 22 ++++------ 4 files changed, 26 insertions(+), 112 deletions(-) diff --git a/tests/base.py b/tests/base.py index d0619ed1..7cc3532c 100644 --- a/tests/base.py +++ b/tests/base.py @@ -20,85 +20,3 @@ See the License for the specific language governing permissions and limitations under the License. 
""" - -import urllib2 -import unittest -import socket - -from StringIO import StringIO - - -# Response -class MockResponse(): - """\ - Base mock response class - """ - code = 200 - msg = "OK" - - def __init__(self, cls): - self.cls = cls - - def content(self): - return "response" - - def response(self, req): - data = self.content(req) - url = req.get_full_url() - resp = urllib2.addinfourl(StringIO(data), data, url) - resp.code = self.code - resp.msg = self.msg - return resp - - -class MockHTTPHandler(urllib2.HTTPHandler, urllib2.HTTPSHandler): - """\ - Mocked HTTPHandler in order to query APIs locally - """ - cls = None - - def https_open(self, req): - return self.http_open(req) - - def http_open(self, req): - r = self.cls.callback(self.cls) - return r.response(req) - - @staticmethod - def patch(cls): - opener = urllib2.build_opener(MockHTTPHandler) - urllib2.install_opener(opener) - # dirty ! - for h in opener.handlers: - if isinstance(h, MockHTTPHandler): - h.cls = cls - return [h for h in opener.handlers if isinstance(h, MockHTTPHandler)][0] - - @staticmethod - def unpatch(): - # urllib2 - urllib2._opener = None - - -class BaseMockTests(unittest.TestCase): - """\ - Base Mock test case - """ - callback = MockResponse - - def setUp(self): - # patch DNS - self.original_getaddrinfo = socket.getaddrinfo - socket.getaddrinfo = self.new_getaddrinfo - MockHTTPHandler.patch(self) - - def tearDown(self): - MockHTTPHandler.unpatch() - # DNS - socket.getaddrinfo = self.original_getaddrinfo - - def new_getaddrinfo(self, *args): - return [(2, 1, 6, '', ('127.0.0.1', 0))] - - def _get_current_testname(self): - return self.id().split('.')[-1:][0] diff --git a/tests/extractors/content.py b/tests/extractors/content.py index b9496b8c..950d2208 100644 --- a/tests/extractors/content.py +++ b/tests/extractors/content.py @@ -23,28 +23,20 @@ import os import json -from base import BaseMockTests, MockResponse +from base import BaseMockTests +from base import MockResponseExtractors from goose import Goose -from goose.utils import FileHelper from goose.configuration import Configuration from goose.text import StopWordsChinese from goose.text import StopWordsArabic from goose.text import StopWordsKorean +from goose.utils import FileHelper CURRENT_PATH = os.path.dirname(os.path.abspath(__file__)) -class MockResponseExtractors(MockResponse): - def content(self, req): - current_test = self.cls._get_current_testname() - path = os.path.join(CURRENT_PATH, "data", "extractors", "%s.html" % current_test) - path = os.path.abspath(path) - content = FileHelper.loadResourceFile(path) - return content - - class TestExtractionBase(BaseMockTests): """\ Extraction test case @@ -52,8 +44,13 @@ class TestExtractionBase(BaseMockTests): callback = MockResponseExtractors def getRawHtml(self): - suite, module, cls, func = self.id().split('.') - path = os.path.join(CURRENT_PATH, "data", module, "%s.html" % func) + test, suite, module, cls, func = self.id().split('.') + path = os.path.join( + os.path.dirname(CURRENT_PATH), + "data", + suite, + module, + "%s.html" % func) path = os.path.abspath(path) content = FileHelper.loadResourceFile(path) return content @@ -62,8 +59,13 @@ def loadData(self): """\ """ - suite, module, cls, func = self.id().split('.') - path = os.path.join(CURRENT_PATH, "data", module, "%s.json" % func) + test, suite, module, cls, func = self.id().split('.') + path = os.path.join( + os.path.dirname(CURRENT_PATH), + "data", + suite, + module, + "%s.json" % func) path = os.path.abspath(path) content = 
FileHelper.loadResourceFile(path) self.data = json.loads(content) diff --git a/tests/extractors/images.py b/tests/extractors/images.py index ace6d323..582bca9f 100644 --- a/tests/extractors/images.py +++ b/tests/extractors/images.py @@ -26,7 +26,7 @@ import unittest from base import MockResponse -from extractors import TestExtractionBase +from base import TestExtractionBase from goose.configuration import Configuration from goose.image import Image diff --git a/tests/extractors/videos.py b/tests/extractors/videos.py index 4f18d0f1..389a414c 100644 --- a/tests/extractors/videos.py +++ b/tests/extractors/videos.py @@ -21,10 +21,9 @@ limitations under the License. """ import os -import json -from .base import MockResponse -from .extractors import TestExtractionBase +from base import MockResponse +from base import TestExtractionBase from goose.utils import FileHelper @@ -34,7 +33,12 @@ class MockResponseVideos(MockResponse): def content(self, req): current_test = self.cls._get_current_testname() - path = os.path.join(CURRENT_PATH, "data", "videos", "%s.html" % current_test) + path = os.path.join( + os.path.dirname(CURRENT_PATH), + "data", + "extractors", + "videos", + "%s.html" % current_test) path = os.path.abspath(path) content = FileHelper.loadResourceFile(path) return content @@ -59,16 +63,6 @@ def assert_movies(self, field, expected_value, result_value): r = getattr(video, k) self.assertEqual(r, v) - def loadData(self): - """\ - - """ - suite, module, cls, func = self.id().split('.') - path = os.path.join(CURRENT_PATH, "data", module, "%s.json" % func) - path = os.path.abspath(path) - content = FileHelper.loadResourceFile(path) - self.data = json.loads(content) - def test_embed(self): article = self.getArticle() fields = ['movies'] From 26ba835b5aeab2e3f0aeda024192ee6725837005 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Wed, 31 Dec 2014 06:32:34 +0100 Subject: [PATCH 083/100] #188 - move image test case --- .../50850547cc7310bc53e30e802c6318f1 | Bin .../test_basic_image/test_basic_image.html | 0 .../test_basic_image/test_basic_image.json | 0 .../test_known_image_css_class.html | 0 .../test_known_image_css_class.json | 0 .../test_known_image_css_id.html | 0 .../test_known_image_css_id.json | 0 .../test_known_image_css_parent_class.html | 0 .../test_known_image_css_parent_class.json | 0 .../test_known_image_css_parent_id.html | 0 .../test_known_image_css_parent_id.json | 0 .../test_known_image_empty_src.html | 0 .../test_known_image_empty_src.json | 0 .../test_known_image_name_parent.html | 0 .../test_known_image_name_parent.json | 0 .../test_opengraph_tag.html | 0 .../test_opengraph_tag.json | 0 tests/extractors/images.py | 29 +++++++++++++++--- 18 files changed, 24 insertions(+), 5 deletions(-) rename tests/data/{ => extractors}/images/test_basic_image/50850547cc7310bc53e30e802c6318f1 (100%) rename tests/data/{ => extractors}/images/test_basic_image/test_basic_image.html (100%) rename tests/data/{ => extractors}/images/test_basic_image/test_basic_image.json (100%) rename tests/data/{ => extractors}/images/test_known_image_css_class/test_known_image_css_class.html (100%) rename tests/data/{ => extractors}/images/test_known_image_css_class/test_known_image_css_class.json (100%) rename tests/data/{ => extractors}/images/test_known_image_css_id/test_known_image_css_id.html (100%) rename tests/data/{ => extractors}/images/test_known_image_css_id/test_known_image_css_id.json (100%) rename tests/data/{ => 
extractors}/images/test_known_image_css_parent_class/test_known_image_css_parent_class.html (100%) rename tests/data/{ => extractors}/images/test_known_image_css_parent_class/test_known_image_css_parent_class.json (100%) rename tests/data/{ => extractors}/images/test_known_image_css_parent_id/test_known_image_css_parent_id.html (100%) rename tests/data/{ => extractors}/images/test_known_image_css_parent_id/test_known_image_css_parent_id.json (100%) rename tests/data/{ => extractors}/images/test_known_image_empty_src/test_known_image_empty_src.html (100%) rename tests/data/{ => extractors}/images/test_known_image_empty_src/test_known_image_empty_src.json (100%) rename tests/data/{ => extractors}/images/test_known_image_name_parent/test_known_image_name_parent.html (100%) rename tests/data/{ => extractors}/images/test_known_image_name_parent/test_known_image_name_parent.json (100%) rename tests/data/{ => extractors}/images/test_opengraph_tag/test_opengraph_tag.html (100%) rename tests/data/{ => extractors}/images/test_opengraph_tag/test_opengraph_tag.json (100%) diff --git a/tests/data/images/test_basic_image/50850547cc7310bc53e30e802c6318f1 b/tests/data/extractors/images/test_basic_image/50850547cc7310bc53e30e802c6318f1 similarity index 100% rename from tests/data/images/test_basic_image/50850547cc7310bc53e30e802c6318f1 rename to tests/data/extractors/images/test_basic_image/50850547cc7310bc53e30e802c6318f1 diff --git a/tests/data/images/test_basic_image/test_basic_image.html b/tests/data/extractors/images/test_basic_image/test_basic_image.html similarity index 100% rename from tests/data/images/test_basic_image/test_basic_image.html rename to tests/data/extractors/images/test_basic_image/test_basic_image.html diff --git a/tests/data/images/test_basic_image/test_basic_image.json b/tests/data/extractors/images/test_basic_image/test_basic_image.json similarity index 100% rename from tests/data/images/test_basic_image/test_basic_image.json rename to tests/data/extractors/images/test_basic_image/test_basic_image.json diff --git a/tests/data/images/test_known_image_css_class/test_known_image_css_class.html b/tests/data/extractors/images/test_known_image_css_class/test_known_image_css_class.html similarity index 100% rename from tests/data/images/test_known_image_css_class/test_known_image_css_class.html rename to tests/data/extractors/images/test_known_image_css_class/test_known_image_css_class.html diff --git a/tests/data/images/test_known_image_css_class/test_known_image_css_class.json b/tests/data/extractors/images/test_known_image_css_class/test_known_image_css_class.json similarity index 100% rename from tests/data/images/test_known_image_css_class/test_known_image_css_class.json rename to tests/data/extractors/images/test_known_image_css_class/test_known_image_css_class.json diff --git a/tests/data/images/test_known_image_css_id/test_known_image_css_id.html b/tests/data/extractors/images/test_known_image_css_id/test_known_image_css_id.html similarity index 100% rename from tests/data/images/test_known_image_css_id/test_known_image_css_id.html rename to tests/data/extractors/images/test_known_image_css_id/test_known_image_css_id.html diff --git a/tests/data/images/test_known_image_css_id/test_known_image_css_id.json b/tests/data/extractors/images/test_known_image_css_id/test_known_image_css_id.json similarity index 100% rename from tests/data/images/test_known_image_css_id/test_known_image_css_id.json rename to 
tests/data/extractors/images/test_known_image_css_id/test_known_image_css_id.json diff --git a/tests/data/images/test_known_image_css_parent_class/test_known_image_css_parent_class.html b/tests/data/extractors/images/test_known_image_css_parent_class/test_known_image_css_parent_class.html similarity index 100% rename from tests/data/images/test_known_image_css_parent_class/test_known_image_css_parent_class.html rename to tests/data/extractors/images/test_known_image_css_parent_class/test_known_image_css_parent_class.html diff --git a/tests/data/images/test_known_image_css_parent_class/test_known_image_css_parent_class.json b/tests/data/extractors/images/test_known_image_css_parent_class/test_known_image_css_parent_class.json similarity index 100% rename from tests/data/images/test_known_image_css_parent_class/test_known_image_css_parent_class.json rename to tests/data/extractors/images/test_known_image_css_parent_class/test_known_image_css_parent_class.json diff --git a/tests/data/images/test_known_image_css_parent_id/test_known_image_css_parent_id.html b/tests/data/extractors/images/test_known_image_css_parent_id/test_known_image_css_parent_id.html similarity index 100% rename from tests/data/images/test_known_image_css_parent_id/test_known_image_css_parent_id.html rename to tests/data/extractors/images/test_known_image_css_parent_id/test_known_image_css_parent_id.html diff --git a/tests/data/images/test_known_image_css_parent_id/test_known_image_css_parent_id.json b/tests/data/extractors/images/test_known_image_css_parent_id/test_known_image_css_parent_id.json similarity index 100% rename from tests/data/images/test_known_image_css_parent_id/test_known_image_css_parent_id.json rename to tests/data/extractors/images/test_known_image_css_parent_id/test_known_image_css_parent_id.json diff --git a/tests/data/images/test_known_image_empty_src/test_known_image_empty_src.html b/tests/data/extractors/images/test_known_image_empty_src/test_known_image_empty_src.html similarity index 100% rename from tests/data/images/test_known_image_empty_src/test_known_image_empty_src.html rename to tests/data/extractors/images/test_known_image_empty_src/test_known_image_empty_src.html diff --git a/tests/data/images/test_known_image_empty_src/test_known_image_empty_src.json b/tests/data/extractors/images/test_known_image_empty_src/test_known_image_empty_src.json similarity index 100% rename from tests/data/images/test_known_image_empty_src/test_known_image_empty_src.json rename to tests/data/extractors/images/test_known_image_empty_src/test_known_image_empty_src.json diff --git a/tests/data/images/test_known_image_name_parent/test_known_image_name_parent.html b/tests/data/extractors/images/test_known_image_name_parent/test_known_image_name_parent.html similarity index 100% rename from tests/data/images/test_known_image_name_parent/test_known_image_name_parent.html rename to tests/data/extractors/images/test_known_image_name_parent/test_known_image_name_parent.html diff --git a/tests/data/images/test_known_image_name_parent/test_known_image_name_parent.json b/tests/data/extractors/images/test_known_image_name_parent/test_known_image_name_parent.json similarity index 100% rename from tests/data/images/test_known_image_name_parent/test_known_image_name_parent.json rename to tests/data/extractors/images/test_known_image_name_parent/test_known_image_name_parent.json diff --git a/tests/data/images/test_opengraph_tag/test_opengraph_tag.html b/tests/data/extractors/images/test_opengraph_tag/test_opengraph_tag.html 
similarity index 100% rename from tests/data/images/test_opengraph_tag/test_opengraph_tag.html rename to tests/data/extractors/images/test_opengraph_tag/test_opengraph_tag.html diff --git a/tests/data/images/test_opengraph_tag/test_opengraph_tag.json b/tests/data/extractors/images/test_opengraph_tag/test_opengraph_tag.json similarity index 100% rename from tests/data/images/test_opengraph_tag/test_opengraph_tag.json rename to tests/data/extractors/images/test_opengraph_tag/test_opengraph_tag.json diff --git a/tests/extractors/images.py b/tests/extractors/images.py index 582bca9f..e47a1dde 100644 --- a/tests/extractors/images.py +++ b/tests/extractors/images.py @@ -43,7 +43,13 @@ class MockResponseImage(MockResponse): def image_content(self, req): md5_hash = hashlib.md5(req.get_full_url()).hexdigest() current_test = self.cls._get_current_testname() - path = os.path.join(CURRENT_PATH, "data", "images", current_test, md5_hash) + path = os.path.join( + os.path.dirname(CURRENT_PATH), + "data", + "extractors", + "images", + current_test, + md5_hash) path = os.path.abspath(path) f = open(path, 'rb') content = f.read() @@ -52,7 +58,13 @@ def image_content(self, req): def html_content(self, req): current_test = self.cls._get_current_testname() - path = os.path.join(CURRENT_PATH, "data", "images", current_test, "%s.html" % current_test) + path = os.path.join( + os.path.dirname(CURRENT_PATH), + "data", + "extractors", + "images", + current_test, + "%s.html" % current_test) path = os.path.abspath(path) return FileHelper.loadResourceFile(path) @@ -72,8 +84,15 @@ def loadData(self): """\ """ - suite, module, cls, func = self.id().split('.') - path = os.path.join(CURRENT_PATH, "data", module, func, "%s.json" % func) + test, suite, module, cls, func = self.id().split('.') + path = os.path.join( + os.path.dirname(CURRENT_PATH), + "data", + suite, + module, + func, + "%s.json" % func) + path = os.path.abspath(path) content = FileHelper.loadResourceFile(path) self.data = json.loads(content) @@ -158,7 +177,7 @@ def test_opengraph_tag(self): class ImageUtilsTests(unittest.TestCase): def setUp(self): - self.path = 'tests/data/images/test_basic_image/50850547cc7310bc53e30e802c6318f1' + self.path = 'tests/data/extractors/images/test_basic_image/50850547cc7310bc53e30e802c6318f1' self.expected_results = { 'width': 476, 'height': 317, From ff4449cc27fa9dc508f4ce7a0c06b7e16bd6656d Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Wed, 31 Dec 2014 06:36:14 +0100 Subject: [PATCH 084/100] #188 - remove useless file --- tests/base.py | 22 ---------------------- 1 file changed, 22 deletions(-) delete mode 100644 tests/base.py diff --git a/tests/base.py b/tests/base.py deleted file mode 100644 index 7cc3532c..00000000 --- a/tests/base.py +++ /dev/null @@ -1,22 +0,0 @@ -# -*- coding: utf-8 -*- -"""\ -This is a python port of "Goose" orignialy licensed to Gravity.com -under one or more contributor license agreements. See the NOTICE file -distributed with this work for additional information -regarding copyright ownership. - -Python port was written by Xavier Grangier for Recrutae - -Gravity.com licenses this file -to you under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance -with the License. 
You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" From c381993b05688ad63d57306b87e1a4a1acfddbb2 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Wed, 31 Dec 2014 06:40:38 +0100 Subject: [PATCH 085/100] #188 - new extractors test files --- tests/extractors/authors.py | 0 tests/extractors/metas.py | 0 tests/extractors/opengraph.py | 0 tests/extractors/publishdate.py | 0 tests/extractors/tags.py | 0 tests/extractors/title.py | 0 tests/extractors/tweets.py | 0 7 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/extractors/authors.py create mode 100644 tests/extractors/metas.py create mode 100644 tests/extractors/opengraph.py create mode 100644 tests/extractors/publishdate.py create mode 100644 tests/extractors/tags.py create mode 100644 tests/extractors/title.py create mode 100644 tests/extractors/tweets.py diff --git a/tests/extractors/authors.py b/tests/extractors/authors.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/extractors/metas.py b/tests/extractors/metas.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/extractors/opengraph.py b/tests/extractors/opengraph.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/extractors/publishdate.py b/tests/extractors/publishdate.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/extractors/tags.py b/tests/extractors/tags.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/extractors/title.py b/tests/extractors/title.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/extractors/tweets.py b/tests/extractors/tweets.py new file mode 100644 index 00000000..e69de29b From 0e6a7713e25c7a54ec6196b85111713335c0e834 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Wed, 31 Dec 2014 06:53:07 +0100 Subject: [PATCH 086/100] #188 - test refactor video image tags publishdate --- .../test_publish_date.html | 0 .../test_publish_date.json | 0 .../test_publish_date_article.html | 0 .../test_publish_date_article.json | 0 .../test_publish_date_rnews.html | 0 .../test_publish_date_rnews.json | 0 .../test_publish_date_schema.html | 0 .../test_publish_date_schema.json | 0 .../{content => tags}/test_tags_abcau.html | 0 .../{content => tags}/test_tags_abcau.json | 0 .../{content => tags}/test_tags_cnet.html | 0 .../{content => tags}/test_tags_cnet.json | 0 .../{content => tags}/test_tags_deadline.html | 0 .../{content => tags}/test_tags_deadline.json | 0 .../{content => tags}/test_tags_kexp.html | 0 .../{content => tags}/test_tags_kexp.json | 0 .../{content => tags}/test_tags_wnyc.html | 0 .../{content => tags}/test_tags_wnyc.json | 0 tests/extractors/base.py | 25 ++----- tests/extractors/content.py | 67 ----------------- tests/extractors/publishdate.py | 43 +++++++++++ tests/extractors/tags.py | 72 +++++++++++++++++++ tests/extractors/videos.py | 16 ----- 23 files changed, 119 insertions(+), 104 deletions(-) rename tests/data/extractors/{content => publishdate}/test_publish_date.html (100%) rename tests/data/extractors/{content => publishdate}/test_publish_date.json (100%) rename tests/data/extractors/{content => publishdate}/test_publish_date_article.html (100%) rename
tests/data/extractors/{content => publishdate}/test_publish_date_article.json (100%) rename tests/data/extractors/{content => publishdate}/test_publish_date_rnews.html (100%) rename tests/data/extractors/{content => publishdate}/test_publish_date_rnews.json (100%) rename tests/data/extractors/{content => publishdate}/test_publish_date_schema.html (100%) rename tests/data/extractors/{content => publishdate}/test_publish_date_schema.json (100%) rename tests/data/extractors/{content => tags}/test_tags_abcau.html (100%) rename tests/data/extractors/{content => tags}/test_tags_abcau.json (100%) rename tests/data/extractors/{content => tags}/test_tags_cnet.html (100%) rename tests/data/extractors/{content => tags}/test_tags_cnet.json (100%) rename tests/data/extractors/{content => tags}/test_tags_deadline.html (100%) rename tests/data/extractors/{content => tags}/test_tags_deadline.json (100%) rename tests/data/extractors/{content => tags}/test_tags_kexp.html (100%) rename tests/data/extractors/{content => tags}/test_tags_kexp.json (100%) rename tests/data/extractors/{content => tags}/test_tags_wnyc.html (100%) rename tests/data/extractors/{content => tags}/test_tags_wnyc.json (100%) diff --git a/tests/data/extractors/content/test_publish_date.html b/tests/data/extractors/publishdate/test_publish_date.html similarity index 100% rename from tests/data/extractors/content/test_publish_date.html rename to tests/data/extractors/publishdate/test_publish_date.html diff --git a/tests/data/extractors/content/test_publish_date.json b/tests/data/extractors/publishdate/test_publish_date.json similarity index 100% rename from tests/data/extractors/content/test_publish_date.json rename to tests/data/extractors/publishdate/test_publish_date.json diff --git a/tests/data/extractors/content/test_publish_date_article.html b/tests/data/extractors/publishdate/test_publish_date_article.html similarity index 100% rename from tests/data/extractors/content/test_publish_date_article.html rename to tests/data/extractors/publishdate/test_publish_date_article.html diff --git a/tests/data/extractors/content/test_publish_date_article.json b/tests/data/extractors/publishdate/test_publish_date_article.json similarity index 100% rename from tests/data/extractors/content/test_publish_date_article.json rename to tests/data/extractors/publishdate/test_publish_date_article.json diff --git a/tests/data/extractors/content/test_publish_date_rnews.html b/tests/data/extractors/publishdate/test_publish_date_rnews.html similarity index 100% rename from tests/data/extractors/content/test_publish_date_rnews.html rename to tests/data/extractors/publishdate/test_publish_date_rnews.html diff --git a/tests/data/extractors/content/test_publish_date_rnews.json b/tests/data/extractors/publishdate/test_publish_date_rnews.json similarity index 100% rename from tests/data/extractors/content/test_publish_date_rnews.json rename to tests/data/extractors/publishdate/test_publish_date_rnews.json diff --git a/tests/data/extractors/content/test_publish_date_schema.html b/tests/data/extractors/publishdate/test_publish_date_schema.html similarity index 100% rename from tests/data/extractors/content/test_publish_date_schema.html rename to tests/data/extractors/publishdate/test_publish_date_schema.html diff --git a/tests/data/extractors/content/test_publish_date_schema.json b/tests/data/extractors/publishdate/test_publish_date_schema.json similarity index 100% rename from tests/data/extractors/content/test_publish_date_schema.json rename to 
tests/data/extractors/publishdate/test_publish_date_schema.json diff --git a/tests/data/extractors/content/test_tags_abcau.html b/tests/data/extractors/tags/test_tags_abcau.html similarity index 100% rename from tests/data/extractors/content/test_tags_abcau.html rename to tests/data/extractors/tags/test_tags_abcau.html diff --git a/tests/data/extractors/content/test_tags_abcau.json b/tests/data/extractors/tags/test_tags_abcau.json similarity index 100% rename from tests/data/extractors/content/test_tags_abcau.json rename to tests/data/extractors/tags/test_tags_abcau.json diff --git a/tests/data/extractors/content/test_tags_cnet.html b/tests/data/extractors/tags/test_tags_cnet.html similarity index 100% rename from tests/data/extractors/content/test_tags_cnet.html rename to tests/data/extractors/tags/test_tags_cnet.html diff --git a/tests/data/extractors/content/test_tags_cnet.json b/tests/data/extractors/tags/test_tags_cnet.json similarity index 100% rename from tests/data/extractors/content/test_tags_cnet.json rename to tests/data/extractors/tags/test_tags_cnet.json diff --git a/tests/data/extractors/content/test_tags_deadline.html b/tests/data/extractors/tags/test_tags_deadline.html similarity index 100% rename from tests/data/extractors/content/test_tags_deadline.html rename to tests/data/extractors/tags/test_tags_deadline.html diff --git a/tests/data/extractors/content/test_tags_deadline.json b/tests/data/extractors/tags/test_tags_deadline.json similarity index 100% rename from tests/data/extractors/content/test_tags_deadline.json rename to tests/data/extractors/tags/test_tags_deadline.json diff --git a/tests/data/extractors/content/test_tags_kexp.html b/tests/data/extractors/tags/test_tags_kexp.html similarity index 100% rename from tests/data/extractors/content/test_tags_kexp.html rename to tests/data/extractors/tags/test_tags_kexp.html diff --git a/tests/data/extractors/content/test_tags_kexp.json b/tests/data/extractors/tags/test_tags_kexp.json similarity index 100% rename from tests/data/extractors/content/test_tags_kexp.json rename to tests/data/extractors/tags/test_tags_kexp.json diff --git a/tests/data/extractors/content/test_tags_wnyc.html b/tests/data/extractors/tags/test_tags_wnyc.html similarity index 100% rename from tests/data/extractors/content/test_tags_wnyc.html rename to tests/data/extractors/tags/test_tags_wnyc.html diff --git a/tests/data/extractors/content/test_tags_wnyc.json b/tests/data/extractors/tags/test_tags_wnyc.json similarity index 100% rename from tests/data/extractors/content/test_tags_wnyc.json rename to tests/data/extractors/tags/test_tags_wnyc.json diff --git a/tests/extractors/base.py b/tests/extractors/base.py index 60990b77..e19d20e0 100644 --- a/tests/extractors/base.py +++ b/tests/extractors/base.py @@ -114,13 +114,13 @@ def _get_current_testname(self): class MockResponseExtractors(MockResponse): def content(self, req): - current_test = self.cls._get_current_testname() + test, suite, module, cls, func = self.cls.id().split('.') path = os.path.join( os.path.dirname(CURRENT_PATH), "data", - "extractors", - "content", - "%s.html" % current_test) + suite, + module, + "%s.html" % func) path = os.path.abspath(path) content = FileHelper.loadResourceFile(path) return content @@ -184,23 +184,6 @@ def assert_cleaned_text(self, field, expected_value, result_value): msg = u"The beginning of the article text was not as expected!" 
self.assertEqual(expected_value, result_value, msg=msg) - def assert_tags(self, field, expected_value, result_value): - """\ - - """ - # as we have a set in expected_value and a list in result_value - # make result_value a set - expected_value = set(expected_value) - - # check if both have the same number of items - msg = (u"expected tags set and result tags set" - u"don't have the same number of items") - self.assertEqual(len(result_value), len(expected_value), msg=msg) - - # check if each tag in result_value is in expected_value - for tag in result_value: - self.assertTrue(tag in expected_value) - def runArticleAssertions(self, article, fields): """\ diff --git a/tests/extractors/content.py b/tests/extractors/content.py index 950d2208..3eeaf1fe 100644 --- a/tests/extractors/content.py +++ b/tests/extractors/content.py @@ -95,23 +95,6 @@ def assert_cleaned_text(self, field, expected_value, result_value): msg = u"The beginning of the article text was not as expected!" self.assertEqual(expected_value, result_value, msg=msg) - def assert_tags(self, field, expected_value, result_value): - """\ - - """ - # as we have a set in expected_value and a list in result_value - # make result_value a set - expected_value = set(expected_value) - - # check if both have the same number of items - msg = (u"expected tags set and result tags set" - u"don't have the same number of items") - self.assertEqual(len(result_value), len(expected_value), msg=msg) - - # check if each tag in result_value is in expected_value - for tag in result_value: - self.assertTrue(tag in expected_value) - def runArticleAssertions(self, article, fields): """\ @@ -387,25 +370,6 @@ def test_articlebody_tag(self): self.runArticleAssertions(article=article, fields=fields) -class TestPublishDate(TestExtractionBase): - - def test_publish_date(self): - article = self.getArticle() - self.runArticleAssertions(article=article, fields=['publish_date']) - - def test_publish_date_rnews(self): - article = self.getArticle() - self.runArticleAssertions(article=article, fields=['publish_date']) - - def test_publish_date_article(self): - article = self.getArticle() - self.runArticleAssertions(article=article, fields=['publish_date']) - - def test_publish_date_schema(self): - article = self.getArticle() - self.runArticleAssertions(article=article, fields=['publish_date']) - - class TestExtractWithUrl(TestExtractionBase): def test_get_canonical_url(self): @@ -484,34 +448,3 @@ def test_author_schema(self): article = self.getArticle() fields = ['authors'] self.runArticleAssertions(article=article, fields=fields) - - -class TestArticleTags(TestExtractionBase): - - def test_tags_kexp(self): - article = self.getArticle() - fields = ['tags'] - self.runArticleAssertions(article=article, fields=fields) - - def test_tags_deadline(self): - article = self.getArticle() - fields = ['tags'] - self.runArticleAssertions(article=article, fields=fields) - - def test_tags_wnyc(self): - article = self.getArticle() - fields = ['tags'] - self.runArticleAssertions(article=article, fields=fields) - - def test_tags_cnet(self): - article = self.getArticle() - fields = ['tags'] - self.runArticleAssertions(article=article, fields=fields) - - def test_tags_abcau(self): - """ - Test ABC Australia page with "topics" tags - """ - article = self.getArticle() - fields = ['tags'] - self.runArticleAssertions(article=article, fields=fields) diff --git a/tests/extractors/publishdate.py b/tests/extractors/publishdate.py index e69de29b..8d2a13b9 100644 --- a/tests/extractors/publishdate.py 
+++ b/tests/extractors/publishdate.py @@ -0,0 +1,43 @@ +# -*- coding: utf-8 -*- +"""\ +This is a python port of "Goose" orignialy licensed to Gravity.com +under one or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. + +Python port was written by Xavier Grangier for Recrutae + +Gravity.com licenses this file +to you under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance +with the License. You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from base import TestExtractionBase + + +class TestPublishDate(TestExtractionBase): + + def test_publish_date(self): + article = self.getArticle() + self.runArticleAssertions(article=article, fields=['publish_date']) + + def test_publish_date_rnews(self): + article = self.getArticle() + self.runArticleAssertions(article=article, fields=['publish_date']) + + def test_publish_date_article(self): + article = self.getArticle() + self.runArticleAssertions(article=article, fields=['publish_date']) + + def test_publish_date_schema(self): + article = self.getArticle() + self.runArticleAssertions(article=article, fields=['publish_date']) diff --git a/tests/extractors/tags.py b/tests/extractors/tags.py index e69de29b..22b17129 100644 --- a/tests/extractors/tags.py +++ b/tests/extractors/tags.py @@ -0,0 +1,72 @@ +# -*- coding: utf-8 -*- +"""\ +This is a python port of "Goose" orignialy licensed to Gravity.com +under one or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. + +Python port was written by Xavier Grangier for Recrutae + +Gravity.com licenses this file +to you under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance +with the License. You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +from base import TestExtractionBase + + +class TestArticleTags(TestExtractionBase): + + def assert_tags(self, field, expected_value, result_value): + """\ + + """ + # as we have a set in expected_value and a list in result_value + # make result_value a set + expected_value = set(expected_value) + + # check if both have the same number of items + msg = (u"expected tags set and result tags set" + u"don't have the same number of items") + self.assertEqual(len(result_value), len(expected_value), msg=msg) + + # check if each tag in result_value is in expected_value + for tag in result_value: + self.assertTrue(tag in expected_value) + + def test_tags_kexp(self): + article = self.getArticle() + fields = ['tags'] + self.runArticleAssertions(article=article, fields=fields) + + def test_tags_deadline(self): + article = self.getArticle() + fields = ['tags'] + self.runArticleAssertions(article=article, fields=fields) + + def test_tags_wnyc(self): + article = self.getArticle() + fields = ['tags'] + self.runArticleAssertions(article=article, fields=fields) + + def test_tags_cnet(self): + article = self.getArticle() + fields = ['tags'] + self.runArticleAssertions(article=article, fields=fields) + + def test_tags_abcau(self): + """ + Test ABC Australia page with "topics" tags + """ + article = self.getArticle() + fields = ['tags'] + self.runArticleAssertions(article=article, fields=fields) diff --git a/tests/extractors/videos.py b/tests/extractors/videos.py index 389a414c..23d1670d 100644 --- a/tests/extractors/videos.py +++ b/tests/extractors/videos.py @@ -30,26 +30,10 @@ CURRENT_PATH = os.path.dirname(os.path.abspath(__file__)) -class MockResponseVideos(MockResponse): - def content(self, req): - current_test = self.cls._get_current_testname() - path = os.path.join( - os.path.dirname(CURRENT_PATH), - "data", - "extractors", - "videos", - "%s.html" % current_test) - path = os.path.abspath(path) - content = FileHelper.loadResourceFile(path) - return content - - class ImageExtractionTests(TestExtractionBase): """\ Base Mock test case """ - callback = MockResponseVideos - def assert_movies(self, field, expected_value, result_value): # check if result_value is a list self.assertTrue(isinstance(result_value, list)) From b762ea8dca09b9fd6c3c29ab0b932389d91696b1 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Wed, 31 Dec 2014 06:56:28 +0100 Subject: [PATCH 087/100] #188 - move tweets tests case --- .../{content => tweets}/test_tweet.html | 0 .../{content => tweets}/test_tweet.json | 0 tests/extractors/content.py | 9 ----- tests/extractors/tweets.py | 33 +++++++++++++++++++ 4 files changed, 33 insertions(+), 9 deletions(-) rename tests/data/extractors/{content => tweets}/test_tweet.html (100%) rename tests/data/extractors/{content => tweets}/test_tweet.json (100%) diff --git a/tests/data/extractors/content/test_tweet.html b/tests/data/extractors/tweets/test_tweet.html similarity index 100% rename from tests/data/extractors/content/test_tweet.html rename to tests/data/extractors/tweets/test_tweet.html diff --git a/tests/data/extractors/content/test_tweet.json b/tests/data/extractors/tweets/test_tweet.json similarity index 100% rename from tests/data/extractors/content/test_tweet.json rename to tests/data/extractors/tweets/test_tweet.json diff --git a/tests/extractors/content.py b/tests/extractors/content.py index 3eeaf1fe..35b13f20 100644 --- a/tests/extractors/content.py +++ b/tests/extractors/content.py @@ -424,15 +424,6 @@ def extract(self, instance): return article -class 
TestArticleTweet(TestExtractionBase): - - def test_tweet(self): - article = self.getArticle() - number_tweets = len(article.tweets) - expected_number_tweets = self.data['expected']['tweets'] - self.assertEqual(number_tweets, expected_number_tweets) - - class TestArticleLinks(TestExtractionBase): def test_links(self): diff --git a/tests/extractors/tweets.py b/tests/extractors/tweets.py index e69de29b..50300f43 100644 --- a/tests/extractors/tweets.py +++ b/tests/extractors/tweets.py @@ -0,0 +1,33 @@ +# -*- coding: utf-8 -*- +"""\ +This is a python port of "Goose" orignialy licensed to Gravity.com +under one or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. + +Python port was written by Xavier Grangier for Recrutae + +Gravity.com licenses this file +to you under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance +with the License. You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from base import TestExtractionBase + + +class TestArticleTweet(TestExtractionBase): + + def test_tweet(self): + article = self.getArticle() + number_tweets = len(article.tweets) + expected_number_tweets = self.data['expected']['tweets'] + self.assertEqual(number_tweets, expected_number_tweets) From ea693a917829a2f5f83815fd4798bf8b21d48eb4 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Wed, 31 Dec 2014 07:06:12 +0100 Subject: [PATCH 088/100] #188 - test refactor --- .../{content => links}/test_links.html | 0 .../{content => links}/test_links.json | 0 .../test_opengraph.html | 0 .../test_opengraph.json | 0 tests/extractors/content.py | 135 +----------------- tests/extractors/links.py | 33 +++++ tests/extractors/opengraph.py | 32 +++++ tests/extractors/videos.py | 7 - 8 files changed, 66 insertions(+), 141 deletions(-) rename tests/data/extractors/{content => links}/test_links.html (100%) rename tests/data/extractors/{content => links}/test_links.json (100%) rename tests/data/extractors/{content => opengraph}/test_opengraph.html (100%) rename tests/data/extractors/{content => opengraph}/test_opengraph.json (100%) create mode 100644 tests/extractors/links.py diff --git a/tests/data/extractors/content/test_links.html b/tests/data/extractors/links/test_links.html similarity index 100% rename from tests/data/extractors/content/test_links.html rename to tests/data/extractors/links/test_links.html diff --git a/tests/data/extractors/content/test_links.json b/tests/data/extractors/links/test_links.json similarity index 100% rename from tests/data/extractors/content/test_links.json rename to tests/data/extractors/links/test_links.json diff --git a/tests/data/extractors/content/test_opengraph.html b/tests/data/extractors/opengraph/test_opengraph.html similarity index 100% rename from tests/data/extractors/content/test_opengraph.html rename to tests/data/extractors/opengraph/test_opengraph.html diff --git a/tests/data/extractors/content/test_opengraph.json b/tests/data/extractors/opengraph/test_opengraph.json similarity index 100% rename from tests/data/extractors/content/test_opengraph.json rename to 
tests/data/extractors/opengraph/test_opengraph.json diff --git a/tests/extractors/content.py b/tests/extractors/content.py index 35b13f20..5b287f18 100644 --- a/tests/extractors/content.py +++ b/tests/extractors/content.py @@ -20,130 +20,11 @@ See the License for the specific language governing permissions and limitations under the License. """ -import os -import json +from base import TestExtractionBase -from base import BaseMockTests -from base import MockResponseExtractors - -from goose import Goose -from goose.configuration import Configuration from goose.text import StopWordsChinese from goose.text import StopWordsArabic from goose.text import StopWordsKorean -from goose.utils import FileHelper - - -CURRENT_PATH = os.path.dirname(os.path.abspath(__file__)) - - -class TestExtractionBase(BaseMockTests): - """\ - Extraction test case - """ - callback = MockResponseExtractors - - def getRawHtml(self): - test, suite, module, cls, func = self.id().split('.') - path = os.path.join( - os.path.dirname(CURRENT_PATH), - "data", - suite, - module, - "%s.html" % func) - path = os.path.abspath(path) - content = FileHelper.loadResourceFile(path) - return content - - def loadData(self): - """\ - - """ - test, suite, module, cls, func = self.id().split('.') - path = os.path.join( - os.path.dirname(CURRENT_PATH), - "data", - suite, - module, - "%s.json" % func) - path = os.path.abspath(path) - content = FileHelper.loadResourceFile(path) - self.data = json.loads(content) - - def assert_cleaned_text(self, field, expected_value, result_value): - """\ - - """ - # # TODO : handle verbose level in tests - # print "\n=======================::. ARTICLE REPORT %s .::======================\n" % self.id() - # print 'expected_value (%s) \n' % len(expected_value) - # print expected_value - # print "-------" - # print 'result_value (%s) \n' % len(result_value) - # print result_value - - # cleaned_text is Null - msg = u"Resulting article text was NULL!" - self.assertNotEqual(result_value, None, msg=msg) - - # cleaned_text length - msg = u"Article text was not as long as expected beginning!" - self.assertTrue(len(expected_value) <= len(result_value), msg=msg) - - # clean_text value - result_value = result_value[0:len(expected_value)] - msg = u"The beginning of the article text was not as expected!" 
- self.assertEqual(expected_value, result_value, msg=msg) - - def runArticleAssertions(self, article, fields): - """\ - - """ - for field in fields: - expected_value = self.data['expected'][field] - result_value = getattr(article, field, None) - - # custom assertion for a given field - assertion = 'assert_%s' % field - if hasattr(self, assertion): - getattr(self, assertion)(field, expected_value, result_value) - continue - - # default assertion - msg = u"Error %s \nexpected: %s\nresult: %s" % (field, expected_value, result_value) - self.assertEqual(expected_value, result_value, msg=msg) - - def extract(self, instance): - article = instance.extract(url=self.data['url']) - return article - - def getConfig(self): - config = Configuration() - config.enable_image_fetching = False - return config - - def getArticle(self): - """\ - - """ - # load test case data - self.loadData() - - # basic configuration - # no image fetching - config = self.getConfig() - self.parser = config.get_parser() - - # target language - # needed for non english language most of the time - target_language = self.data.get('target_language') - if target_language: - config.target_language = target_language - config.use_meta_language = False - - # run goose - g = Goose(config=config) - return self.extract(g) class TestExtractions(TestExtractionBase): @@ -330,11 +211,6 @@ def test_okaymarketing(self): fields = ['cleaned_text'] self.runArticleAssertions(article=article, fields=fields) - def test_opengraph(self): - article = self.getArticle() - fields = ['opengraph'] - self.runArticleAssertions(article=article, fields=fields) - def test_title_opengraph(self): article = self.getArticle() fields = ['title'] @@ -424,15 +300,6 @@ def extract(self, instance): return article -class TestArticleLinks(TestExtractionBase): - - def test_links(self): - article = self.getArticle() - number_links = len(article.links) - expected_number_links = self.data['expected']['links'] - self.assertEqual(number_links, expected_number_links) - - class TestArticleAuthor(TestExtractionBase): def test_author_schema(self): diff --git a/tests/extractors/links.py b/tests/extractors/links.py new file mode 100644 index 00000000..8539465e --- /dev/null +++ b/tests/extractors/links.py @@ -0,0 +1,33 @@ +# -*- coding: utf-8 -*- +"""\ +This is a python port of "Goose" orignialy licensed to Gravity.com +under one or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. + +Python port was written by Xavier Grangier for Recrutae + +Gravity.com licenses this file +to you under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance +with the License. You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +from base import TestExtractionBase + + +class TestArticleLinks(TestExtractionBase): + + def test_links(self): + article = self.getArticle() + number_links = len(article.links) + expected_number_links = self.data['expected']['links'] + self.assertEqual(number_links, expected_number_links) diff --git a/tests/extractors/opengraph.py b/tests/extractors/opengraph.py index e69de29b..415a784c 100644 --- a/tests/extractors/opengraph.py +++ b/tests/extractors/opengraph.py @@ -0,0 +1,32 @@ +# -*- coding: utf-8 -*- +"""\ +This is a python port of "Goose" orignialy licensed to Gravity.com +under one or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. + +Python port was written by Xavier Grangier for Recrutae + +Gravity.com licenses this file +to you under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance +with the License. You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from base import TestExtractionBase + + +class TestOpenGraph(TestExtractionBase): + + def test_opengraph(self): + article = self.getArticle() + fields = ['opengraph'] + self.runArticleAssertions(article=article, fields=fields) diff --git a/tests/extractors/videos.py b/tests/extractors/videos.py index 23d1670d..10be15ff 100644 --- a/tests/extractors/videos.py +++ b/tests/extractors/videos.py @@ -20,15 +20,8 @@ See the License for the specific language governing permissions and limitations under the License. 
""" -import os - -from base import MockResponse from base import TestExtractionBase -from goose.utils import FileHelper - -CURRENT_PATH = os.path.dirname(os.path.abspath(__file__)) - class ImageExtractionTests(TestExtractionBase): """\ From 41e951ce3be2ab5c29bd7a9d24f5e2ee391f02d9 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Wed, 31 Dec 2014 07:18:05 +0100 Subject: [PATCH 089/100] #188 - move authors tests --- .../test_author_schema.html | 0 .../test_author_schema.json | 0 tests/extractors/authors.py | 32 +++++++++++++++++++ tests/extractors/content.py | 8 ----- 4 files changed, 32 insertions(+), 8 deletions(-) rename tests/data/extractors/{content => authors}/test_author_schema.html (100%) rename tests/data/extractors/{content => authors}/test_author_schema.json (100%) diff --git a/tests/data/extractors/content/test_author_schema.html b/tests/data/extractors/authors/test_author_schema.html similarity index 100% rename from tests/data/extractors/content/test_author_schema.html rename to tests/data/extractors/authors/test_author_schema.html diff --git a/tests/data/extractors/content/test_author_schema.json b/tests/data/extractors/authors/test_author_schema.json similarity index 100% rename from tests/data/extractors/content/test_author_schema.json rename to tests/data/extractors/authors/test_author_schema.json diff --git a/tests/extractors/authors.py b/tests/extractors/authors.py index e69de29b..709040c1 100644 --- a/tests/extractors/authors.py +++ b/tests/extractors/authors.py @@ -0,0 +1,32 @@ +# -*- coding: utf-8 -*- +"""\ +This is a python port of "Goose" orignialy licensed to Gravity.com +under one or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. + +Python port was written by Xavier Grangier for Recrutae + +Gravity.com licenses this file +to you under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance +with the License. You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +from base import TestExtractionBase + + +class TestArticleAuthor(TestExtractionBase): + + def test_author_schema(self): + article = self.getArticle() + fields = ['authors'] + self.runArticleAssertions(article=article, fields=fields) diff --git a/tests/extractors/content.py b/tests/extractors/content.py index 5b287f18..1e940ee9 100644 --- a/tests/extractors/content.py +++ b/tests/extractors/content.py @@ -298,11 +298,3 @@ class TestExtractionsRaw(TestExtractions): def extract(self, instance): article = instance.extract(raw_html=self.getRawHtml()) return article - - -class TestArticleAuthor(TestExtractionBase): - - def test_author_schema(self): - article = self.getArticle() - fields = ['authors'] - self.runArticleAssertions(article=article, fields=fields) From 9be09b8a8dd6a07bf59b5e7e0a3565267606a89a Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Wed, 31 Dec 2014 07:22:19 +0100 Subject: [PATCH 090/100] #188 - move title tests --- .../test_title_opengraph.html | 0 .../test_title_opengraph.json | 0 tests/extractors/content.py | 5 --- tests/extractors/title.py | 32 +++++++++++++++++++ 4 files changed, 32 insertions(+), 5 deletions(-) rename tests/data/extractors/{content => title}/test_title_opengraph.html (100%) rename tests/data/extractors/{content => title}/test_title_opengraph.json (100%) diff --git a/tests/data/extractors/content/test_title_opengraph.html b/tests/data/extractors/title/test_title_opengraph.html similarity index 100% rename from tests/data/extractors/content/test_title_opengraph.html rename to tests/data/extractors/title/test_title_opengraph.html diff --git a/tests/data/extractors/content/test_title_opengraph.json b/tests/data/extractors/title/test_title_opengraph.json similarity index 100% rename from tests/data/extractors/content/test_title_opengraph.json rename to tests/data/extractors/title/test_title_opengraph.json diff --git a/tests/extractors/content.py b/tests/extractors/content.py index 1e940ee9..30dc2754 100644 --- a/tests/extractors/content.py +++ b/tests/extractors/content.py @@ -211,11 +211,6 @@ def test_okaymarketing(self): fields = ['cleaned_text'] self.runArticleAssertions(article=article, fields=fields) - def test_title_opengraph(self): - article = self.getArticle() - fields = ['title'] - self.runArticleAssertions(article=article, fields=fields) - def test_issue129(self): article = self.getArticle() fields = ['cleaned_text'] diff --git a/tests/extractors/title.py b/tests/extractors/title.py index e69de29b..36bee9a2 100644 --- a/tests/extractors/title.py +++ b/tests/extractors/title.py @@ -0,0 +1,32 @@ +# -*- coding: utf-8 -*- +"""\ +This is a python port of "Goose" orignialy licensed to Gravity.com +under one or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. + +Python port was written by Xavier Grangier for Recrutae + +Gravity.com licenses this file +to you under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance +with the License. You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +from base import TestExtractionBase + + +class TestTitle(TestExtractionBase): + + def test_title_opengraph(self): + article = self.getArticle() + fields = ['title'] + self.runArticleAssertions(article=article, fields=fields) From 6959185a8f72d9d94d5b01fc33ecb3439a3e3fb0 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Wed, 31 Dec 2014 07:30:42 +0100 Subject: [PATCH 091/100] #188 - add empty meta test case --- tests/extractors/metas.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tests/extractors/metas.py b/tests/extractors/metas.py index e69de29b..fd45915a 100644 --- a/tests/extractors/metas.py +++ b/tests/extractors/metas.py @@ -0,0 +1,29 @@ +# -*- coding: utf-8 -*- +"""\ +This is a python port of "Goose" orignialy licensed to Gravity.com +under one or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. + +Python port was written by Xavier Grangier for Recrutae + +Gravity.com licenses this file +to you under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance +with the License. You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from base import TestExtractionBase + + +class TestMetas(TestExtractionBase): + + pass From ca1d8240246c64bfc8c4e5a838f5c1ba1aa33471 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Wed, 31 Dec 2014 07:37:42 +0100 Subject: [PATCH 092/100] bump version --- goose/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/goose/version.py b/goose/version.py index 875065c7..c8718138 100644 --- a/goose/version.py +++ b/goose/version.py @@ -21,5 +21,5 @@ limitations under the License. """ -version_info = (1, 0, 23) +version_info = (1, 0, 24) __version__ = ".".join(map(str, version_info)) From f9f1f1db5015a4819d8597061e41eef652c747a5 Mon Sep 17 00:00:00 2001 From: randvis Date: Fri, 2 Jan 2015 22:51:42 +0800 Subject: [PATCH 093/100] 191 - keep available parsers list unchanged during multiple extract() calls --- goose/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/goose/__init__.py b/goose/__init__.py index 49073bd1..409b5732 100644 --- a/goose/__init__.py +++ b/goose/__init__.py @@ -59,7 +59,7 @@ def shutdown_network(self): pass def crawl(self, crawl_candiate): - parsers = self.config.available_parsers + parsers = list(self.config.available_parsers) parsers.remove(self.config.parser_class) try: crawler = Crawler(self.config) From c583da286c710ee321055fc81b3b610ae0ceafec Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Sat, 3 Jan 2015 10:30:45 +0100 Subject: [PATCH 094/100] bump version --- goose/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/goose/version.py b/goose/version.py index c8718138..fedcbb6d 100644 --- a/goose/version.py +++ b/goose/version.py @@ -21,5 +21,5 @@ limitations under the License. 
""" -version_info = (1, 0, 24) +version_info = (1, 0, 25) __version__ = ".".join(map(str, version_info)) From 7981697c3704bb19b1b6a618300a5cd517ab16f9 Mon Sep 17 00:00:00 2001 From: Nathan Breit Date: Tue, 20 Jan 2015 21:47:57 +0800 Subject: [PATCH 095/100] Check for empty title --- goose/extractors/title.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/goose/extractors/title.py b/goose/extractors/title.py index 8104c52b..092471f2 100644 --- a/goose/extractors/title.py +++ b/goose/extractors/title.py @@ -50,6 +50,11 @@ def clean_title(self, title): # TechCrunch | my wonderfull article # my wonderfull article | TechCrunch title_words = title.split() + + # check for an empty title + # so that we don't get an IndexError below + if len(title_words) == 0: + return u"" # check if first letter is in TITLE_SPLITTERS # if so remove it From 3bf8f5ec4e0c2dcad12437a7acb5b510d58e35e6 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Sat, 24 Jan 2015 21:18:41 +0100 Subject: [PATCH 096/100] #199 - pep8 --- goose/extractors/title.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/goose/extractors/title.py b/goose/extractors/title.py index 092471f2..31d69840 100644 --- a/goose/extractors/title.py +++ b/goose/extractors/title.py @@ -50,7 +50,7 @@ def clean_title(self, title): # TechCrunch | my wonderfull article # my wonderfull article | TechCrunch title_words = title.split() - + # check for an empty title # so that we don't get an IndexError below if len(title_words) == 0: From aee045dc2bee3a252bb13b8a00e6317371c781f3 Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Sat, 24 Jan 2015 21:19:04 +0100 Subject: [PATCH 097/100] #199 - test for empty title --- tests/data/extractors/title/test_title_empty.html | 12 ++++++++++++ tests/data/extractors/title/test_title_empty.json | 6 ++++++ tests/extractors/title.py | 5 +++++ 3 files changed, 23 insertions(+) create mode 100644 tests/data/extractors/title/test_title_empty.html create mode 100644 tests/data/extractors/title/test_title_empty.json diff --git a/tests/data/extractors/title/test_title_empty.html b/tests/data/extractors/title/test_title_empty.html new file mode 100644 index 00000000..63a8cab9 --- /dev/null +++ b/tests/data/extractors/title/test_title_empty.html @@ -0,0 +1,12 @@ + + + + + +
    +

    + TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies. +

    +
+</body>
+</html>

diff --git a/tests/data/extractors/title/test_title_empty.json b/tests/data/extractors/title/test_title_empty.json
new file mode 100644
index 00000000..c31bab9f
--- /dev/null
+++ b/tests/data/extractors/title/test_title_empty.json
@@ -0,0 +1,6 @@
+{
+    "url": "http://exemple.com/test_title_empty.html",
+    "expected": {
+        "title": ""
+    }
+}

diff --git a/tests/extractors/title.py b/tests/extractors/title.py
index 36bee9a2..09170205 100644
--- a/tests/extractors/title.py
+++ b/tests/extractors/title.py
@@ -30,3 +30,8 @@ def test_title_opengraph(self):
         article = self.getArticle()
         fields = ['title']
         self.runArticleAssertions(article=article, fields=fields)
+
+    def test_title_empty(self):
+        article = self.getArticle()
+        fields = ['title']
+        self.runArticleAssertions(article=article, fields=fields)

From cc9d892139cad23b98d43f267cf4ab620a63cb52 Mon Sep 17 00:00:00 2001
From: Steven Maude
Date: Thu, 19 Feb 2015 12:35:13 +0000
Subject: [PATCH 098/100] Tidy README.rst

Minor typo fixes.
---
 README.rst | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/README.rst b/README.rst
index 86f3cf7a..5dc8ab0b 100644
--- a/README.rst
+++ b/README.rst
@@ -5,9 +5,9 @@ Intro
 -----
 
 Goose was originally an article extractor written in Java that has most
-recently (aug2011) been converted to a `scala project `_.
+recently (Aug2011) been converted to a `scala project `_.
 
-This is a complete rewrite in python. The aim of the software is to
+This is a complete rewrite in Python. The aim of the software is to
 take any news article or article-type web page and not only extract what
 is the main body of the article but also all meta data and most probable
 image candidate.
@@ -16,11 +16,11 @@ Goose will try to extract the following information:
 
 - Main text of an article
 - Main image of article
-- Any Youtube/Vimeo movies embedded in article
+- Any YouTube/Vimeo movies embedded in article
 - Meta Description
 - Meta tags
 
-The python version was rewritten by:
+The Python version was rewritten by:
 
 - Xavier Grangier
 
@@ -28,10 +28,10 @@ Licensing
 ---------
 
 If you find Goose useful or have issues please drop me a line. I'd love
-to hear how you're using it or what features should be improved
+to hear how you're using it or what features should be improved.
 
-Goose is licensed by Gravity.com under the Apache 2.0 license, see the
-LICENSE file for more details
+Goose is licensed by Gravity.com under the Apache 2.0 license; see the
+LICENSE file for more details.
 
 Setup
 -----
@@ -70,13 +70,13 @@ pass goose a Configuration() object. The second one is to pass
 a configuration dict.
 
 For instance, if you want to change the userAgent used by Goose just
-pass :
+pass:
 
 ::
 
     >>> g = Goose({'browser_user_agent': 'Mozilla'})
 
-Switching parsers : Goose can now be use with lxml html parser or lxml
+Switching parsers : Goose can now be used with lxml html parser or lxml
 soup parser. By default the html parser is used. If you want to use the
 soup parser pass it in the configuration dict :
 
@@ -87,8 +87,8 @@ soup parser pass it in the configuration dict :
 Goose is now language aware
 ---------------------------
 
-For example scrapping a Spanish content page with correct meta language
-tags
+For example, scraping a Spanish content page with correct meta language
+tags:
 
 ::
 
@@ -114,7 +114,7 @@ configuration :
 
     u'Importante golpe a la banda terrorista ETA en Francia.
La Guardia Civil ha detenido en un hotel de Macon, a 70 kil\xf3metros de Lyon, a Izaskun Lesaka y ' Passing {'use\_meta\_language': False, 'target\_language':'es'} will -force as configuration will force the spanish language +forcibly select Spanish. Video extraction @@ -146,7 +146,7 @@ Goose in Chinese Some users want to use Goose for Chinese content. Chinese word segmentation is way more difficult to deal with than occidental languages. Chinese needs a dedicated StopWord analyser that need to be -passed to the config object +passed to the config object. :: @@ -202,7 +202,7 @@ Known issues ------------ - There are some issues with unicode URLs. -- Cookie handling : Some websites need cookie handling. At the moment the only work around is to use the raw_html extraction. For instance ; +- Cookie handling : Some websites need cookie handling. At the moment the only work around is to use the raw_html extraction. For instance: >>> import urllib2 >>> import goose From 5db0166d03fd12fb5e9abb4e111a01feaeead369 Mon Sep 17 00:00:00 2001 From: Amal Francis Date: Wed, 4 Mar 2015 12:41:07 +0530 Subject: [PATCH 099/100] Type fix: Issue #204 --- goose/extractors/title.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/goose/extractors/title.py b/goose/extractors/title.py index 31d69840..a59dca92 100644 --- a/goose/extractors/title.py +++ b/goose/extractors/title.py @@ -32,7 +32,7 @@ class TitleExtractor(BaseExtractor): def clean_title(self, title): """Clean title with the use of og:site_name - in this case try to get ride of site name + in this case try to get rid of site name and use TITLE_SPLITTERS to reformat title """ # check if we have the site name in opengraph data From 09023ec9f5ef26a628a2365616c0a7c864f0ecea Mon Sep 17 00:00:00 2001 From: Xavier Grangier Date: Sun, 29 Mar 2015 16:04:10 +0200 Subject: [PATCH 100/100] #217 - check if content value is not None --- goose/extractors/opengraph.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/goose/extractors/opengraph.py b/goose/extractors/opengraph.py index a52ac349..dc43b4bf 100644 --- a/goose/extractors/opengraph.py +++ b/goose/extractors/opengraph.py @@ -34,5 +34,6 @@ def extract(self): attr = self.parser.getAttribute(meta, 'property') if attr is not None and attr.startswith("og:"): value = self.parser.getAttribute(meta, 'content') - opengraph_dict.update({attr.split(":")[1]: value}) + if value: + opengraph_dict.update({attr.split(":")[1]: value}) return opengraph_dict
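
Two of the fixes in this series lend themselves to short standalone demonstrations.

The parser fix in PATCH 093 works because ``list()`` takes a per-call copy of the shared ``available_parsers`` list before ``remove()`` mutates it; without the copy, every ``extract()`` call would permanently drain one entry from the configuration's list. Below is a minimal sketch of the failure mode and the fix, in which ``Config`` and the ``crawl_*`` functions are stand-ins for illustration, not Goose's real classes:

::

    class Config(object):
        # shared default list, mimicking config.available_parsers in the diff
        available_parsers = ['lxml', 'soup']
        parser_class = 'lxml'

    def crawl_buggy(config):
        parsers = config.available_parsers        # aliases the shared list
        parsers.remove(config.parser_class)       # mutates it for all later calls
        return parsers

    def crawl_fixed(config):
        parsers = list(config.available_parsers)  # per-call copy (the #191 fix)
        parsers.remove(config.parser_class)
        return parsers

    config = Config()
    crawl_buggy(config)
    try:
        crawl_buggy(config)                       # 'lxml' was already removed
    except ValueError:
        print('shared parser list was drained')

    config.available_parsers = ['lxml', 'soup']
    crawl_fixed(config)
    crawl_fixed(config)                           # safe on repeated calls
    print(config.available_parsers)               # ['lxml', 'soup']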
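PATCH 100 guards against ``og:`` meta tags whose ``content`` attribute is absent (``None``) or empty, which previously let a useless ``None`` value into the OpenGraph dict. The patched loop can be exercised with plain lxml; the helper below is an illustrative sketch that bypasses Goose's own parser wrapper:

::

    from lxml import etree

    HTML = """<html><head>
    <meta property="og:title" content="An article title"/>
    <meta property="og:image" content=""/>
    <meta property="og:type"/>
    </head><body></body></html>"""

    def extract_opengraph(raw_html):
        doc = etree.HTML(raw_html)
        opengraph = {}
        for meta in doc.findall('.//meta'):
            attr = meta.get('property')
            if attr is not None and attr.startswith('og:'):
                value = meta.get('content')
                if value:  # the #217 fix: skip None and empty strings
                    opengraph[attr.split(':')[1]] = value
        return opengraph

    print(extract_opengraph(HTML))  # {'title': 'An article title'}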