diff --git a/.gitignore b/.gitignore index bea68953..4bfadf57 100644 --- a/.gitignore +++ b/.gitignore @@ -8,5 +8,6 @@ env/ *~ .idea ._* +*.egg venv/ goose_extractor.egg-info/ diff --git a/README.rst b/README.rst index 86f3cf7a..5dc8ab0b 100644 --- a/README.rst +++ b/README.rst @@ -5,9 +5,9 @@ Intro ----- Goose was originally an article extractor written in Java that has most -recently (aug2011) been converted to a `scala project `_. +recently (Aug2011) been converted to a `scala project `_. -This is a complete rewrite in python. The aim of the software is to +This is a complete rewrite in Python. The aim of the software is to take any news article or article-type web page and not only extract what is the main body of the article but also all meta data and most probable image candidate. @@ -16,11 +16,11 @@ Goose will try to extract the following information: - Main text of an article - Main image of article -- Any Youtube/Vimeo movies embedded in article +- Any YouTube/Vimeo movies embedded in article - Meta Description - Meta tags -The python version was rewritten by: +The Python version was rewritten by: - Xavier Grangier @@ -28,10 +28,10 @@ Licensing --------- If you find Goose useful or have issues please drop me a line. I'd love -to hear how you're using it or what features should be improved +to hear how you're using it or what features should be improved. -Goose is licensed by Gravity.com under the Apache 2.0 license, see the -LICENSE file for more details +Goose is licensed by Gravity.com under the Apache 2.0 license; see the +LICENSE file for more details. Setup ----- @@ -70,13 +70,13 @@ pass goose a Configuration() object. The second one is to pass a configuration dict. For instance, if you want to change the userAgent used by Goose just -pass : +pass: :: >>> g = Goose({'browser_user_agent': 'Mozilla'}) -Switching parsers : Goose can now be use with lxml html parser or lxml +Switching parsers : Goose can now be used with lxml html parser or lxml soup parser. By default the html parser is used. If you want to use the soup parser pass it in the configuration dict : @@ -87,8 +87,8 @@ soup parser pass it in the configuration dict : Goose is now language aware --------------------------- -For example scrapping a Spanish content page with correct meta language -tags +For example, scraping a Spanish content page with correct meta language +tags: :: @@ -114,7 +114,7 @@ configuration : u'Importante golpe a la banda terrorista ETA en Francia. La Guardia Civil ha detenido en un hotel de Macon, a 70 kil\xf3metros de Lyon, a Izaskun Lesaka y ' Passing {'use\_meta\_language': False, 'target\_language':'es'} will -force as configuration will force the spanish language +forcibly select Spanish. Video extraction @@ -146,7 +146,7 @@ Goose in Chinese Some users want to use Goose for Chinese content. Chinese word segmentation is way more difficult to deal with than occidental languages. Chinese needs a dedicated StopWord analyser that need to be -passed to the config object +passed to the config object. :: @@ -202,7 +202,7 @@ Known issues ------------ - There are some issues with unicode URLs. -- Cookie handling : Some websites need cookie handling. At the moment the only work around is to use the raw_html extraction. For instance ; +- Cookie handling : Some websites need cookie handling. At the moment the only work around is to use the raw_html extraction. 
For instance: >>> import urllib2 >>> import goose diff --git a/goose/__init__.py b/goose/__init__.py index 885dc6e5..409b5732 100644 --- a/goose/__init__.py +++ b/goose/__init__.py @@ -59,8 +59,14 @@ def shutdown_network(self): pass def crawl(self, crawl_candiate): - crawler = Crawler(self.config) - article = crawler.crawl(crawl_candiate) + parsers = list(self.config.available_parsers) + parsers.remove(self.config.parser_class) + try: + crawler = Crawler(self.config) + article = crawler.crawl(crawl_candiate) + except (UnicodeDecodeError, ValueError): + self.config.parser_class = parsers[0] + return self.crawl(crawl_candiate) return article def initialize(self): diff --git a/goose/article.py b/goose/article.py index d195f166..e3f522f5 100644 --- a/goose/article.py +++ b/goose/article.py @@ -26,7 +26,7 @@ class Article(object): def __init__(self): # title of the article - self.title = None + self.title = u"" # stores the lovely, pure text from the article, # stripped of html, formatting, etc... @@ -62,12 +62,24 @@ def __init__(self): # holds a set of tags that may have # been in the artcle, these are not meta keywords - self.tags = set() + self.tags = [] + + # holds a dict of all opengrah data found + self.opengraph = {} + + # holds twitter embeds + self.tweets = [] # holds a list of any movies # we found on the page like youtube, vimeo self.movies = [] + # holds links found in the main article + self.links = [] + + # hold author names + self.authors = [] + # stores the final URL that we're going to try # and fetch content against, this would be expanded if any self.final_url = u"" @@ -94,3 +106,48 @@ def __init__(self): # A property bucket for consumers of goose to store custom data extractions. self.additional_data = {} + + @property + def infos(self): + data = { + "meta": { + "description": self.meta_description, + "lang": self.meta_lang, + "keywords": self.meta_keywords, + "favicon": self.meta_favicon, + "canonical": self.canonical_link, + }, + "image": None, + "domain": self.domain, + "title": self.title, + "cleaned_text": self.cleaned_text, + "opengraph": self.opengraph, + "tags": self.tags, + "tweets": self.tweets, + "movies": [], + "links": self.links, + "authors": self.authors, + "publish_date": self.publish_date + } + + # image + if self.top_image is not None: + data['image'] = { + 'url': self.top_image.src, + 'width': self.top_image.width, + 'height': self.top_image.height, + 'type': 'image' + } + + # movies + for movie in self.movies: + data['movies'].append({ + 'embed_type': movie.embed_type, + 'provider': movie.provider, + 'width': movie.width, + 'height': movie.height, + 'embed_code': movie.embed_code, + 'src': movie.src, + }) + + return data diff --git a/goose/cleaners.py b/goose/cleaners.py index 2e8bc87a..c1384ee0 100644 --- a/goose/cleaners.py +++ b/goose/cleaners.py @@ -246,7 +246,8 @@ def div_to_para(self, doc, dom_type): bad_divs += 1 elif div is not None: replaceNodes = self.get_replacement_nodes(doc, div) - div.clear() + for child in self.parser.childNodes(div): + div.remove(child) for c, n in enumerate(replaceNodes): div.insert(c, n) diff --git a/goose/configuration.py b/goose/configuration.py index 42696f58..fcfa5b9a 100644 --- a/goose/configuration.py +++ b/goose/configuration.py @@ -29,6 +29,11 @@ HTTP_DEFAULT_TIMEOUT = 30 +AVAILABLE_PARSERS = { + 'lxml': Parser, + 'soup': ParserSoup, +} + class Configuration(object): @@ -84,6 +89,7 @@ def __init__(self): self.additional_data_extractor = None # Parser type + self.available_parsers = AVAILABLE_PARSERS.keys() 
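Reviewer note: the new ``Goose.crawl`` above retries a failed crawl with the alternate parser. It copies ``available_parsers``, removes the configured ``parser_class``, and on a ``UnicodeDecodeError`` or ``ValueError`` switches to the remaining entry and recurses. Nothing changes for the caller; a minimal usage sketch in the README's doctest style (the URL is a placeholder):

::

    >>> from goose import Goose
    >>> g = Goose({'parser_class': 'soup'})  # default is 'lxml'
    >>> article = g.extract(url='http://example.com/article.html')

One caveat worth flagging: if both registered parsers keep raising those exceptions, the recursion alternates between them rather than giving up.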
self.parser_class = 'lxml' # set the local storage path @@ -94,19 +100,7 @@ def __init__(self): self.http_timeout = HTTP_DEFAULT_TIMEOUT def get_parser(self): - return Parser if self.parser_class == 'lxml' else ParserSoup - - def get_publishdate_extractor(self): - return self.extract_publishdate - - def set_publishdate_extractor(self, extractor): - """\ - Pass in to extract article publish dates. - @param extractor a concrete instance of PublishDateExtractor - """ - if not extractor: - raise ValueError("extractor must not be null!") - self.extract_publishdate = extractor + return AVAILABLE_PARSERS[self.parser_class] def get_additionaldata_extractor(self): return self.additional_data_extractor diff --git a/goose/crawler.py b/goose/crawler.py index 211d410e..34daf048 100644 --- a/goose/crawler.py +++ b/goose/crawler.py @@ -25,11 +25,20 @@ from copy import deepcopy from goose.article import Article from goose.utils import URLHelper, RawHelper -from goose.extractors import StandardContentExtractor +from goose.extractors.content import StandardContentExtractor +from goose.extractors.videos import VideoExtractor +from goose.extractors.title import TitleExtractor +from goose.extractors.images import ImageExtractor +from goose.extractors.links import LinksExtractor +from goose.extractors.tweets import TweetsExtractor +from goose.extractors.authors import AuthorsExtractor +from goose.extractors.tags import TagsExtractor +from goose.extractors.opengraph import OpenGraphExtractor +from goose.extractors.publishdate import PublishDateExtractor +from goose.extractors.metas import MetasExtractor from goose.cleaners import StandardDocumentCleaner from goose.outputformatters import StandardOutputFormatter -from goose.images.extractors import UpgradedImageIExtractor -from goose.videos.extractors import VideoExtractor + from goose.network import HtmlFetcher @@ -63,9 +72,33 @@ def __init__(self, config): # init the output formatter self.formatter = self.get_formatter() + # metas extractor + self.metas_extractor = self.get_metas_extractor() + + # publishdate extractor + self.publishdate_extractor = self.get_publishdate_extractor() + + # opengraph extractor + self.opengraph_extractor = self.get_opengraph_extractor() + + # tags extractor + self.tags_extractor = self.get_tags_extractor() + + # authors extractor + self.authors_extractor = self.get_authors_extractor() + + # tweets extractor + self.tweets_extractor = self.get_tweets_extractor() + + # links extractor + self.links_extractor = self.get_links_extractor() + # video extractor self.video_extractor = self.get_video_extractor() + # title extractor + self.title_extractor = self.get_title_extractor() + # image extrator self.image_extractor = self.get_image_extractor() @@ -95,17 +128,37 @@ def crawl(self, crawl_candidate): self.article.raw_html = raw_html self.article.doc = doc self.article.raw_doc = deepcopy(doc) - # TODO - # self.article.publish_date = config.publishDateExtractor.extract(doc) - # self.article.additional_data = config.get_additionaldata_extractor.extract(doc) - self.article.title = self.extractor.get_title() - self.article.meta_lang = self.extractor.get_meta_lang() - self.article.meta_favicon = self.extractor.get_favicon() - self.article.meta_description = self.extractor.get_meta_description() - self.article.meta_keywords = self.extractor.get_meta_keywords() - self.article.canonical_link = self.extractor.get_canonical_link() - self.article.domain = self.extractor.get_domain() - self.article.tags = self.extractor.extract_tags() + + # open graph 
+ self.article.opengraph = self.opengraph_extractor.extract() + + # publishdate + self.article.publish_date = self.publishdate_extractor.extract() + + # meta + metas = self.metas_extractor.extract() + self.article.meta_lang = metas['lang'] + self.article.meta_favicon = metas['favicon'] + self.article.meta_description = metas['description'] + self.article.meta_keywords = metas['keywords'] + self.article.canonical_link = metas['canonical'] + self.article.domain = metas['domain'] + + # tags + self.article.tags = self.tags_extractor.extract() + + # authors + self.article.authors = self.authors_extractor.extract() + + # title + self.article.title = self.title_extractor.extract() + + # check for known node as content body + # if we find one force the article.doc to be the found node + # this will prevent the cleaner to remove unwanted text content + article_body = self.extractor.get_known_article_tags() + if article_body is not None: + self.article.doc = article_body # before we do any calcs on the body itself let's clean up the document self.article.doc = self.cleaner.clean() @@ -117,10 +170,16 @@ def crawl(self, crawl_candidate): # let's process it if self.article.top_node is not None: - # video handeling + # article links + self.article.links = self.links_extractor.extract() + + # tweets + self.article.tweets = self.tweets_extractor.extract() + + # video handling self.video_extractor.get_videos() - # image handeling + # image handling if self.config.enable_image_fetching: self.get_image() @@ -160,8 +219,32 @@ def get_html(self, crawl_candidate, parsing_candidate): }) return html + def get_metas_extractor(self): + return MetasExtractor(self.config, self.article) + + def get_publishdate_extractor(self): + return PublishDateExtractor(self.config, self.article) + + def get_opengraph_extractor(self): + return OpenGraphExtractor(self.config, self.article) + + def get_tags_extractor(self): + return TagsExtractor(self.config, self.article) + + def get_authors_extractor(self): + return AuthorsExtractor(self.config, self.article) + + def get_tweets_extractor(self): + return TweetsExtractor(self.config, self.article) + + def get_links_extractor(self): + return LinksExtractor(self.config, self.article) + + def get_title_extractor(self): + return TitleExtractor(self.config, self.article) + def get_image_extractor(self): - return UpgradedImageIExtractor(self.config, self.article) + return ImageExtractor(self.config, self.article) def get_video_extractor(self): return VideoExtractor(self.config, self.article) diff --git a/goose/extractors/__init__.py b/goose/extractors/__init__.py new file mode 100644 index 00000000..5554efbc --- /dev/null +++ b/goose/extractors/__init__.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +"""\ +This is a python port of "Goose" orignialy licensed to Gravity.com +under one or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. + +Python port was written by Xavier Grangier for Recrutae + +Gravity.com licenses this file +to you under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance +with the License. You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
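Reviewer note: with extraction now delegated to the per-field extractors wired up above, the new ``Article.infos`` property (added in ``goose/article.py`` earlier in this diff) flattens everything into one plain dict, which makes the result easy to serialize. A hedged usage sketch (placeholder URL):

::

    >>> import json
    >>> from goose import Goose
    >>> article = Goose().extract(url='http://example.com/article.html')
    >>> sorted(article.infos.keys())
    ['authors', 'cleaned_text', 'domain', 'image', 'links', 'meta', 'movies', 'opengraph', 'publish_date', 'tags', 'title', 'tweets']
    >>> json.dumps(article.infos)  # every value is a plain, serializable type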
+See the License for the specific language governing permissions and +limitations under the License. +""" + + +class BaseExtractor(object): + + def __init__(self, config, article): + # config + self.config = config + + # parser + self.parser = self.config.get_parser() + + # article + self.article = article + + # stopwords class + self.stopwords_class = config.stopwords_class diff --git a/goose/extractors/authors.py b/goose/extractors/authors.py new file mode 100644 index 00000000..414f4eea --- /dev/null +++ b/goose/extractors/authors.py @@ -0,0 +1,46 @@ +# -*- coding: utf-8 -*- +"""\ +This is a python port of "Goose" orignialy licensed to Gravity.com +under one or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. + +Python port was written by Xavier Grangier for Recrutae + +Gravity.com licenses this file +to you under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance +with the License. You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from goose.extractors import BaseExtractor + + +class AuthorsExtractor(BaseExtractor): + + def extract(self): + authors = [] + author_nodes = self.parser.getElementsByTag( + self.article.doc, + attr='itemprop', + value='author') + + for author in author_nodes: + name_nodes = self.parser.getElementsByTag( + author, + attr='itemprop', + value='name') + + if len(name_nodes) > 0: + name = self.parser.getText(name_nodes[0]) + authors.append(name) + + return list(set(authors)) diff --git a/goose/extractors.py b/goose/extractors/content.py similarity index 62% rename from goose/extractors.py rename to goose/extractors/content.py index 1c8a37f1..e0703d55 100644 --- a/goose/extractors.py +++ b/goose/extractors/content.py @@ -20,217 +20,56 @@ See the License for the specific language governing permissions and limitations under the License. 
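Reviewer note: every extractor in the new ``goose.extractors`` package inherits ``BaseExtractor``, which supplies ``self.config``, ``self.parser``, ``self.article`` and ``self.stopwords_class``. A custom extractor follows the same shape; the ``H1Extractor`` below is purely illustrative and not part of this diff (the ``getElementsByTag``/``getText`` calls mirror the parser API used throughout the patch):

::

    from goose.extractors import BaseExtractor

    class H1Extractor(BaseExtractor):

        def extract(self):
            # collect the text of every <h1> in the parsed document
            nodes = self.parser.getElementsByTag(self.article.doc, tag='h1')
            return [self.parser.getText(node) for node in nodes]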
""" -import re from copy import deepcopy -from urlparse import urlparse, urljoin -from goose.utils import StringSplitter -from goose.utils import StringReplacement -from goose.utils import ReplaceSequence -MOTLEY_REPLACEMENT = StringReplacement("�", "") -ESCAPED_FRAGMENT_REPLACEMENT = StringReplacement(u"#!", u"?_escaped_fragment_=") -TITLE_REPLACEMENTS = ReplaceSequence().create(u"»").append(u"»") -PIPE_SPLITTER = StringSplitter("\\|") -DASH_SPLITTER = StringSplitter(" - ") -ARROWS_SPLITTER = StringSplitter("»") -COLON_SPLITTER = StringSplitter(":") -SPACE_SPLITTER = StringSplitter(' ') -NO_STRINGS = set() -A_REL_TAG_SELECTOR = "a[rel=tag]" -A_HREF_TAG_SELECTOR = "a[href*='/tag/'], a[href*='/tags/'], a[href*='/topic/'], a[href*='?keyword=']" -RE_LANG = r'^[A-Za-z]{2}$' +from goose.extractors import BaseExtractor -class ContentExtractor(object): +KNOWN_ARTICLE_CONTENT_TAGS = [ + {'attr': 'itemprop', 'value': 'articleBody'}, + {'attr': 'class', 'value': 'post-content'}, + {'tag': 'article'}, +] - def __init__(self, config, article): - # config - self.config = config - # parser - self.parser = self.config.get_parser() +class ContentExtractor(BaseExtractor): - # article - self.article = article - - # language - self.language = config.target_language - - # stopwords class - self.stopwords_class = config.stopwords_class - - def get_title(self): - """\ - Fetch the article title and analyze it - """ - - title = '' - doc = self.article.doc - - title_element = self.parser.getElementsByTag(doc, tag='title') - # no title found - if title_element is None or len(title_element) == 0: - return title - - # title elem found - title_text = self.parser.getText(title_element[0]) - used_delimeter = False - - # split title with | - if '|' in title_text: - title_text = self.split_title(title_text, PIPE_SPLITTER) - used_delimeter = True - - # split title with - - if not used_delimeter and '-' in title_text: - title_text = self.split_title(title_text, DASH_SPLITTER) - used_delimeter = True - - # split title with » - if not used_delimeter and u'»' in title_text: - title_text = self.split_title(title_text, ARROWS_SPLITTER) - used_delimeter = True - - # split title with : - if not used_delimeter and ':' in title_text: - title_text = self.split_title(title_text, COLON_SPLITTER) - used_delimeter = True - - title = MOTLEY_REPLACEMENT.replaceAll(title_text) - return title - - def split_title(self, title, splitter): - """\ - Split the title to best part possible - """ - large_text_length = 0 - large_text_index = 0 - title_pieces = splitter.split(title) - - # find the largest title piece - for i in range(len(title_pieces)): - current = title_pieces[i] - if len(current) > large_text_length: - large_text_length = len(current) - large_text_index = i - - # replace content - title = title_pieces[large_text_index] - return TITLE_REPLACEMENTS.replaceAll(title).strip() - - def get_favicon(self): - """\ - Extract the favicon from a website - http://en.wikipedia.org/wiki/Favicon - - - """ - kwargs = {'tag': 'link', 'attr': 'rel', 'value': 'icon'} - meta = self.parser.getElementsByTag(self.article.doc, **kwargs) - if meta: - favicon = self.parser.getAttribute(meta[0], 'href') - return favicon - return '' - - def get_meta_lang(self): - """\ - Extract content language from meta - """ - # we have a lang attribute in html - attr = self.parser.getAttribute(self.article.doc, attr='lang') - if attr is None: - # look up for a Content-Language in meta - items = [ - {'tag': 'meta', 'attr': 'http-equiv', 'value': 'content-language'}, - {'tag': 
'meta', 'attr': 'name', 'value': 'lang'} - ] - for item in items: - meta = self.parser.getElementsByTag(self.article.doc, **item) - if meta: - attr = self.parser.getAttribute(meta[0], attr='content') - break - - if attr: - value = attr[:2] - if re.search(RE_LANG, value): - return value.lower() - - return None - - def get_meta_content(self, doc, metaName): - """\ - Extract a given meta content form document - """ - meta = self.parser.css_select(doc, metaName) - content = None - - if meta is not None and len(meta) > 0: - content = self.parser.getAttribute(meta[0], 'content') - - if content: - return content.strip() - - return '' - - def get_meta_description(self): - """\ - if the article has meta description set in the source, use that - """ - return self.get_meta_content(self.article.doc, "meta[name=description]") - - def get_meta_keywords(self): + def get_language(self): """\ - if the article has meta keywords set in the source, use that + Returns the language is by the article or + the configuration language """ - return self.get_meta_content(self.article.doc, "meta[name=keywords]") - - def get_canonical_link(self): - """\ - if the article has meta canonical link set in the url - """ - if self.article.final_url: - kwargs = {'tag': 'link', 'attr': 'rel', 'value': 'canonical'} - meta = self.parser.getElementsByTag(self.article.doc, **kwargs) - if meta is not None and len(meta) > 0: - href = self.parser.getAttribute(meta[0], 'href') - if href: - href = href.strip() - o = urlparse(href) - if not o.hostname: - z = urlparse(self.article.final_url) - domain = '%s://%s' % (z.scheme, z.hostname) - href = urljoin(domain, href) - return href - return self.article.final_url - - def get_domain(self): - if self.article.final_url: - o = urlparse(self.article.final_url) - return o.hostname + # we don't want to force the target language + # so we use the article.meta_lang + if self.config.use_meta_language: + if self.article.meta_lang: + return self.article.meta_lang[:2] + return self.config.target_language + + def get_known_article_tags(self): + for item in KNOWN_ARTICLE_CONTENT_TAGS: + nodes = self.parser.getElementsByTag( + self.article.doc, + **item) + if len(nodes): + return nodes[0] return None - def extract_tags(self): - node = self.article.doc - - # node doesn't have chidren - if len(list(node)) == 0: - return NO_STRINGS - - elements = self.parser.css_select(node, A_REL_TAG_SELECTOR) - if not elements: - elements = self.parser.css_select(node, A_HREF_TAG_SELECTOR) - if not elements: - return NO_STRINGS - - tags = [] - for el in elements: - tag = self.parser.getText(el) - if tag: - tags.append(tag) + def is_articlebody(self, node): + for item in KNOWN_ARTICLE_CONTENT_TAGS: + # attribute + if "attr" in item and "value" in item: + if self.parser.getAttribute(node, item['attr']) == item['value']: + return True + # tag + if "tag" in item: + if node.tag == item['tag']: + return True - return set(tags) + return False def calculate_best_node(self): + doc = self.article.doc top_node = None nodes_to_check = self.nodes_to_check(doc) @@ -243,7 +82,7 @@ def calculate_best_node(self): for node in nodes_to_check: text_node = self.parser.getText(node) - word_stats = self.stopwords_class(language=self.language).get_stopword_count(text_node) + word_stats = self.stopwords_class(language=self.get_language()).get_stopword_count(text_node) high_link_density = self.is_highlink_density(node) if word_stats.get_stopword_count() > 2 and not high_link_density: nodes_with_text.append(node) @@ -269,7 +108,7 @@ def 
calculate_best_node(self): boost_score = float(5) text_node = self.parser.getText(node) - word_stats = self.stopwords_class(language=self.language).get_stopword_count(text_node) + word_stats = self.stopwords_class(language=self.get_language()).get_stopword_count(text_node) upscore = int(word_stats.get_stopword_count() + boost_score) # parent node @@ -325,7 +164,7 @@ def is_boostable(self, node): if steps_away >= max_stepsaway_from_node: return False paraText = self.parser.getText(current_node) - word_stats = self.stopwords_class(language=self.language).get_stopword_count(paraText) + word_stats = self.stopwords_class(language=self.get_language()).get_stopword_count(paraText) if word_stats.get_stopword_count() > minimum_stopword_count: return True steps_away += 1 @@ -341,6 +180,10 @@ def walk_siblings(self, node): return b def add_siblings(self, top_node): + # in case the extraction used known attributes + # we don't want to add sibilings + if self.is_articlebody(top_node): + return top_node baselinescore_siblings_para = self.get_siblings_score(top_node) results = self.walk_siblings(top_node) for current_node in results: @@ -368,7 +211,7 @@ def get_siblings_content(self, current_sibling, baselinescore_siblings_para): for first_paragraph in potential_paragraphs: text = self.parser.getText(first_paragraph) if len(text) > 0: - word_stats = self.stopwords_class(language=self.language).get_stopword_count(text) + word_stats = self.stopwords_class(language=self.get_language()).get_stopword_count(text) paragraph_score = word_stats.get_stopword_count() sibling_baseline_score = float(.30) high_link_density = self.is_highlink_density(first_paragraph) @@ -395,7 +238,7 @@ def get_siblings_score(self, top_node): for node in nodes_to_check: text_node = self.parser.getText(node) - word_stats = self.stopwords_class(language=self.language).get_stopword_count(text_node) + word_stats = self.stopwords_class(language=self.get_language()).get_stopword_count(text_node) high_link_density = self.is_highlink_density(node) if word_stats.get_stopword_count() > 2 and not high_link_density: paragraphs_number += 1 @@ -478,6 +321,7 @@ def nodes_to_check(self, doc): on like paragraphs and tables """ nodes_to_check = [] + for tag in ['p', 'pre', 'td']: items = self.parser.getElementsByTag(doc, tag=tag) nodes_to_check += items diff --git a/goose/images/extractors.py b/goose/extractors/images.py similarity index 97% rename from goose/images/extractors.py rename to goose/extractors/images.py index 4372ae8c..3af44f5f 100644 --- a/goose/images/extractors.py +++ b/goose/extractors/images.py @@ -22,10 +22,13 @@ """ import re import os + from urlparse import urlparse, urljoin + +from goose.extractors import BaseExtractor +from goose.image import Image from goose.utils import FileHelper -from goose.images.image import Image -from goose.images.utils import ImageUtils +from goose.utils.images import ImageUtils KNOWN_IMG_DOM_NAMES = [ "yn-story-related-media", @@ -43,24 +46,14 @@ def __init__(self, node, parent_depth, sibling_depth): self.sibling_depth = sibling_depth -class ImageExtractor(object): - pass - - -class UpgradedImageIExtractor(ImageExtractor): +class ImageExtractor(BaseExtractor): def __init__(self, config, article): - self.custom_site_mapping = {} - self.load_customesite_mapping() - - # article - self.article = article + super(ImageExtractor, self).__init__(config, article) - # config - self.config = config + self.custom_site_mapping = {} - # parser - self.parser = self.config.get_parser() + self.load_customesite_mapping() 
# What's the minimum bytes for an image we'd accept is self.images_min_bytes = 4000 diff --git a/goose/extractors/links.py b/goose/extractors/links.py new file mode 100644 index 00000000..6ba668fe --- /dev/null +++ b/goose/extractors/links.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- +"""\ +This is a python port of "Goose" orignialy licensed to Gravity.com +under one or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. + +Python port was written by Xavier Grangier for Recrutae + +Gravity.com licenses this file +to you under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance +with the License. You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from goose.extractors import BaseExtractor + + +class LinksExtractor(BaseExtractor): + + def extract(self): + links = [] + items = self.parser.getElementsByTag(self.article.top_node, 'a') + for i in items: + attr = self.parser.getAttribute(i, 'href') + if attr: + links.append(attr) + return links diff --git a/goose/extractors/metas.py b/goose/extractors/metas.py new file mode 100644 index 00000000..95acadd5 --- /dev/null +++ b/goose/extractors/metas.py @@ -0,0 +1,135 @@ +# -*- coding: utf-8 -*- +"""\ +This is a python port of "Goose" orignialy licensed to Gravity.com +under one or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. + +Python port was written by Xavier Grangier for Recrutae + +Gravity.com licenses this file +to you under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance +with the License. You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
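Reviewer note: ``LinksExtractor`` above deliberately walks ``article.top_node`` rather than the full document, so ``article.links`` only contains hrefs found inside the extracted article body. Illustrative result (values made up):

::

    >>> article.links
    [u'http://example.com/related-story', u'/tag/aviation']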
+""" + +import re +from urlparse import urljoin +from urlparse import urlparse + +from goose.extractors import BaseExtractor + + +RE_LANG = r'^[A-Za-z]{2}$' + + +class MetasExtractor(BaseExtractor): + + def get_domain(self): + if self.article.final_url: + o = urlparse(self.article.final_url) + return o.hostname + return None + + def get_favicon(self): + """\ + Extract the favicon from a website + http://en.wikipedia.org/wiki/Favicon + + + """ + kwargs = {'tag': 'link', 'attr': 'rel', 'value': 'icon'} + meta = self.parser.getElementsByTag(self.article.doc, **kwargs) + if meta: + favicon = self.parser.getAttribute(meta[0], 'href') + return favicon + return '' + + def get_canonical_link(self): + """\ + if the article has meta canonical link set in the url + """ + if self.article.final_url: + kwargs = {'tag': 'link', 'attr': 'rel', 'value': 'canonical'} + meta = self.parser.getElementsByTag(self.article.doc, **kwargs) + if meta is not None and len(meta) > 0: + href = self.parser.getAttribute(meta[0], 'href') + if href: + href = href.strip() + o = urlparse(href) + if not o.hostname: + z = urlparse(self.article.final_url) + domain = '%s://%s' % (z.scheme, z.hostname) + href = urljoin(domain, href) + return href + return self.article.final_url + + def get_meta_lang(self): + """\ + Extract content language from meta + """ + # we have a lang attribute in html + attr = self.parser.getAttribute(self.article.doc, attr='lang') + if attr is None: + # look up for a Content-Language in meta + items = [ + {'tag': 'meta', 'attr': 'http-equiv', 'value': 'content-language'}, + {'tag': 'meta', 'attr': 'name', 'value': 'lang'} + ] + for item in items: + meta = self.parser.getElementsByTag(self.article.doc, **item) + if meta: + attr = self.parser.getAttribute(meta[0], attr='content') + break + + if attr: + value = attr[:2] + if re.search(RE_LANG, value): + return value.lower() + + return None + + def get_meta_content(self, metaName): + """\ + Extract a given meta content form document + """ + meta = self.parser.css_select(self.article.doc, metaName) + content = None + + if meta is not None and len(meta) > 0: + content = self.parser.getAttribute(meta[0], 'content') + + if content: + return content.strip() + + return '' + + def get_meta_description(self): + """\ + if the article has meta description set in the source, use that + """ + return self.get_meta_content("meta[name=description]") + + def get_meta_keywords(self): + """\ + if the article has meta keywords set in the source, use that + """ + return self.get_meta_content("meta[name=keywords]") + + def extract(self): + return { + "description": self.get_meta_description(), + "keywords": self.get_meta_keywords(), + "lang": self.get_meta_lang(), + "favicon": self.get_favicon(), + "canonical": self.get_canonical_link(), + "domain": self.get_domain() + } diff --git a/goose/extractors/opengraph.py b/goose/extractors/opengraph.py new file mode 100644 index 00000000..dc43b4bf --- /dev/null +++ b/goose/extractors/opengraph.py @@ -0,0 +1,39 @@ +# -*- coding: utf-8 -*- +"""\ +This is a python port of "Goose" orignialy licensed to Gravity.com +under one or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. + +Python port was written by Xavier Grangier for Recrutae + +Gravity.com licenses this file +to you under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance +with the License. 
You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from goose.extractors import BaseExtractor + + +class OpenGraphExtractor(BaseExtractor): + + def extract(self): + opengraph_dict = {} + node = self.article.doc + metas = self.parser.getElementsByTag(node, 'meta') + for meta in metas: + attr = self.parser.getAttribute(meta, 'property') + if attr is not None and attr.startswith("og:"): + value = self.parser.getAttribute(meta, 'content') + if value: + opengraph_dict.update({attr.split(":")[1]: value}) + return opengraph_dict diff --git a/goose/extractors/publishdate.py b/goose/extractors/publishdate.py new file mode 100644 index 00000000..1768b1a0 --- /dev/null +++ b/goose/extractors/publishdate.py @@ -0,0 +1,46 @@ +# -*- coding: utf-8 -*- +"""\ +This is a python port of "Goose" orignialy licensed to Gravity.com +under one or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. + +Python port was written by Xavier Grangier for Recrutae + +Gravity.com licenses this file +to you under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance +with the License. You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from goose.extractors import BaseExtractor + +KNOWN_PUBLISH_DATE_TAGS = [ + {'attribute': 'property', 'value': 'rnews:datePublished', 'content': 'content'}, + {'attribute': 'property', 'value': 'article:published_time', 'content': 'content'}, + {'attribute': 'name', 'value': 'OriginalPublicationDate', 'content': 'content'}, + {'attribute': 'itemprop', 'value': 'datePublished', 'content': 'datetime'}, +] + + +class PublishDateExtractor(BaseExtractor): + def extract(self): + for known_meta_tag in KNOWN_PUBLISH_DATE_TAGS: + meta_tags = self.parser.getElementsByTag( + self.article.doc, + attr=known_meta_tag['attribute'], + value=known_meta_tag['value']) + if meta_tags: + return self.parser.getAttribute( + meta_tags[0], + known_meta_tag['content'] + ) + return None diff --git a/goose/extractors/tags.py b/goose/extractors/tags.py new file mode 100644 index 00000000..466e7f81 --- /dev/null +++ b/goose/extractors/tags.py @@ -0,0 +1,51 @@ +# -*- coding: utf-8 -*- +"""\ +This is a python port of "Goose" orignialy licensed to Gravity.com +under one or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. + +Python port was written by Xavier Grangier for Recrutae + +Gravity.com licenses this file +to you under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance +with the License. 
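Reviewer note: two details of the extractors above are worth flagging. ``OpenGraphExtractor`` keys each property by ``attr.split(":")[1]``, so a structured property such as ``og:image:width`` would land under the same ``image`` key as ``og:image``; flat properties map as expected (illustrative values):

::

    <meta property="og:site_name" content="TechCrunch" />
    <meta property="og:title" content="My wonderfull article" />

    >>> article.opengraph
    {u'site_name': u'TechCrunch', u'title': u'My wonderfull article'}

``PublishDateExtractor`` simply returns the raw attribute string of the first matching known tag (e.g. ``article:published_time``); no date parsing or normalisation is attempted.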
You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from goose.extractors import BaseExtractor + +A_REL_TAG_SELECTOR = "a[rel=tag]" +A_HREF_TAG_SELECTOR = "a[href*='/tag/'], a[href*='/tags/'], a[href*='/topic/'], a[href*='?keyword=']" + + +class TagsExtractor(BaseExtractor): + + def extract(self): + node = self.article.doc + tags = [] + + # node doesn't have chidren + if len(list(node)) == 0: + return tags + + elements = self.parser.css_select(node, A_REL_TAG_SELECTOR) + if not elements: + elements = self.parser.css_select(node, A_HREF_TAG_SELECTOR) + if not elements: + return tags + + for el in elements: + tag = self.parser.getText(el) + if tag: + tags.append(tag) + + return list(set(tags)) diff --git a/goose/extractors/title.py b/goose/extractors/title.py new file mode 100644 index 00000000..a59dca92 --- /dev/null +++ b/goose/extractors/title.py @@ -0,0 +1,104 @@ +# -*- coding: utf-8 -*- +"""\ +This is a python port of "Goose" orignialy licensed to Gravity.com +under one or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. + +Python port was written by Xavier Grangier for Recrutae + +Gravity.com licenses this file +to you under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance +with the License. You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
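Reviewer note: ``extract_tags`` moves here as ``TagsExtractor.extract`` and now returns a deduplicated list instead of the old ``set``, matching the ``Article.tags`` change earlier in the diff. Both selectors above still apply, so markup like the following (illustrative) yields both tags, in no guaranteed order since the list round-trips through ``set``:

::

    <a rel="tag" href="/python">python</a>
    <a href="/tag/scraping">scraping</a>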
+""" +import re + +from goose.extractors import BaseExtractor + + +TITLE_SPLITTERS = [u"|", u"-", u"»", u":"] + + +class TitleExtractor(BaseExtractor): + + def clean_title(self, title): + """Clean title with the use of og:site_name + in this case try to get rid of site name + and use TITLE_SPLITTERS to reformat title + """ + # check if we have the site name in opengraph data + if "site_name" in self.article.opengraph.keys(): + site_name = self.article.opengraph['site_name'] + # remove the site name from title + title = title.replace(site_name, '').strip() + + # try to remove the domain from url + if self.article.domain: + pattern = re.compile(self.article.domain, re.IGNORECASE) + title = pattern.sub("", title).strip() + + # split the title in words + # TechCrunch | my wonderfull article + # my wonderfull article | TechCrunch + title_words = title.split() + + # check for an empty title + # so that we don't get an IndexError below + if len(title_words) == 0: + return u"" + + # check if first letter is in TITLE_SPLITTERS + # if so remove it + if title_words[0] in TITLE_SPLITTERS: + title_words.pop(0) + + # check if last letter is in TITLE_SPLITTERS + # if so remove it + if title_words[-1] in TITLE_SPLITTERS: + title_words.pop(-1) + + # rebuild the title + title = u" ".join(title_words).strip() + + return title + + def get_title(self): + """\ + Fetch the article title and analyze it + """ + title = '' + + # rely on opengraph in case we have the data + if "title" in self.article.opengraph.keys(): + title = self.article.opengraph['title'] + return self.clean_title(title) + + # try to fetch the meta headline + meta_headline = self.parser.getElementsByTag( + self.article.doc, + tag="meta", + attr="name", + value="headline") + if meta_headline is not None and len(meta_headline) > 0: + title = self.parser.getAttribute(meta_headline[0], 'content') + return self.clean_title(title) + + # otherwise use the title meta + title_element = self.parser.getElementsByTag(self.article.doc, tag='title') + if title_element is not None and len(title_element) > 0: + title = self.parser.getText(title_element[0]) + return self.clean_title(title) + + return title + + def extract(self): + return self.get_title() diff --git a/goose/extractors/tweets.py b/goose/extractors/tweets.py new file mode 100644 index 00000000..3c17ad8d --- /dev/null +++ b/goose/extractors/tweets.py @@ -0,0 +1,42 @@ +# -*- coding: utf-8 -*- +"""\ +This is a python port of "Goose" orignialy licensed to Gravity.com +under one or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. + +Python port was written by Xavier Grangier for Recrutae + +Gravity.com licenses this file +to you under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance +with the License. You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +from goose.extractors import BaseExtractor + + +class TweetsExtractor(BaseExtractor): + + def extract(self): + tweets = [] + items = self.parser.getElementsByTag( + self.article.top_node, + tag='blockquote', + attr="class", + value="twitter-tweet") + + for i in items: + for attr in ['gravityScore', 'gravityNodes']: + self.parser.delAttribute(i, attr) + tweets.append(self.parser.nodeToString(i)) + + return tweets diff --git a/goose/videos/extractors.py b/goose/extractors/videos.py similarity index 95% rename from goose/videos/extractors.py rename to goose/extractors/videos.py index 71c52895..88fdf20d 100644 --- a/goose/videos/extractors.py +++ b/goose/extractors/videos.py @@ -21,25 +21,19 @@ limitations under the License. """ -from goose.videos.videos import Video +from goose.extractors import BaseExtractor +from goose.video import Video VIDEOS_TAGS = ['iframe', 'embed', 'object', 'video'] VIDEO_PROVIDERS = ['youtube', 'vimeo', 'dailymotion', 'kewego'] -class VideoExtractor(object): +class VideoExtractor(BaseExtractor): """\ Extracts a list of video from Article top node """ def __init__(self, config, article): - # article - self.article = article - - # config - self.config = config - - # parser - self.parser = self.config.get_parser() + super(VideoExtractor, self).__init__(config, article) # candidates self.candidates = [] diff --git a/goose/images/image.py b/goose/image.py similarity index 100% rename from goose/images/image.py rename to goose/image.py diff --git a/goose/network.py b/goose/network.py index 0a338a44..666a7d61 100644 --- a/goose/network.py +++ b/goose/network.py @@ -51,7 +51,7 @@ def get_html(self, url): self.result = urllib2.urlopen( self.request, timeout=self.config.http_timeout) - except: + except Exception: self.result = None # read the result content diff --git a/goose/outputformatters.py b/goose/outputformatters.py index ae42457b..1f8ba4bd 100644 --- a/goose/outputformatters.py +++ b/goose/outputformatters.py @@ -47,9 +47,9 @@ def get_language(self): Returns the language is by the article or the configuration language """ - # we don't want to force the target laguage + # we don't want to force the target language # so we use the article.meta_lang - if self.config.use_meta_language == True: + if self.config.use_meta_language: if self.article.meta_lang: return self.article.meta_lang[:2] return self.config.target_language diff --git a/goose/text.py b/goose/text.py index 4008d62b..3ef63d6b 100644 --- a/goose/text.py +++ b/goose/text.py @@ -46,7 +46,7 @@ def encodeValue(value): value = smart_unicode(value) except (UnicodeEncodeError, DjangoUnicodeDecodeError): value = smart_str(value) - except: + except Exception: value = string_org return value @@ -95,7 +95,12 @@ def __init__(self, language='en'): # to generate dynamic path for file to load if not language in self._cached_stop_words: path = os.path.join('text', 'stopwords-%s.txt' % language) - self._cached_stop_words[language] = set(FileHelper.loadResourceFile(path).splitlines()) + try: + content = FileHelper.loadResourceFile(path) + word_list = content.splitlines() + except IOError: + word_list = [] + self._cached_stop_words[language] = set(word_list) self.STOP_WORDS = self._cached_stop_words[language] def remove_punctuation(self, content): diff --git a/goose/utils/__init__.py b/goose/utils/__init__.py index a8be19b5..5a1de7d4 100644 --- a/goose/utils/__init__.py +++ b/goose/utils/__init__.py @@ -105,19 +105,6 @@ def get_parsing_candidate(self, url_to_crawl): return ParsingCandidate(final_url, link_hash) -class 
StringSplitter(object): - """\ - - """ - def __init__(self, pattern): - self.pattern = re.compile(pattern) - - def split(self, string): - if not string: - return [] - return self.pattern.split(string) - - class StringReplacement(object): def __init__(self, pattern, replaceWith): diff --git a/goose/images/utils.py b/goose/utils/images.py similarity index 90% rename from goose/images/utils.py rename to goose/utils/images.py index 2767416f..388d5c85 100644 --- a/goose/images/utils.py +++ b/goose/utils/images.py @@ -25,20 +25,23 @@ import urllib2 from PIL import Image from goose.utils.encoding import smart_str -from goose.images.image import ImageDetails -from goose.images.image import LocallyStoredImage +from goose.image import ImageDetails +from goose.image import LocallyStoredImage class ImageUtils(object): @classmethod def get_image_dimensions(self, identify_program, path): - image = Image.open(path) image_details = ImageDetails() - image_details.set_mime_type(image.format) - width, height = image.size - image_details.set_width(width) - image_details.set_height(height) + try: + image = Image.open(path) + image_details.set_mime_type(image.format) + width, height = image.size + image_details.set_width(width) + image_details.set_height(height) + except IOError: + image_details.set_mime_type('NA') return image_details @classmethod @@ -116,5 +119,5 @@ def fetch(self, http_client, src): f = urllib2.urlopen(req) data = f.read() return data - except: + except Exception: return None diff --git a/goose/version.py b/goose/version.py index 43693f9c..fedcbb6d 100644 --- a/goose/version.py +++ b/goose/version.py @@ -21,5 +21,5 @@ limitations under the License. """ -version_info = (1, 0, 22) +version_info = (1, 0, 25) __version__ = ".".join(map(str, version_info)) diff --git a/goose/videos/videos.py b/goose/video.py similarity index 100% rename from goose/videos/videos.py rename to goose/video.py diff --git a/goose/videos/__init__.py b/goose/videos/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/setup.py b/setup.py index 2e2b74c0..ebad2547 100644 --- a/setup.py +++ b/setup.py @@ -50,7 +50,7 @@ try: with open(os.path.join(os.path.dirname(__file__), 'README.rst')) as f: long_description = f.read() -except: +except Exception: long_description = description setup(name='goose-extractor', diff --git a/tests/base.py b/tests/base.py deleted file mode 100644 index d0619ed1..00000000 --- a/tests/base.py +++ /dev/null @@ -1,104 +0,0 @@ -# -*- coding: utf-8 -*- -"""\ -This is a python port of "Goose" orignialy licensed to Gravity.com -under one or more contributor license agreements. See the NOTICE file -distributed with this work for additional information -regarding copyright ownership. - -Python port was written by Xavier Grangier for Recrutae - -Gravity.com licenses this file -to you under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance -with the License. You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
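Reviewer note: beyond the renames, this batch hardens several failure paths: bare ``except:`` clauses become ``except Exception:``, an unreadable image now yields an ``ImageDetails`` with mime type ``'NA'`` instead of raising, and a missing stopwords file degrades to an empty stopword set. A quick check of the last behaviour (assuming no ``stopwords-xx.txt`` resource exists):

::

    >>> from goose.text import StopWords
    >>> StopWords(language='xx').STOP_WORDS
    set([])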
-""" - -import urllib2 -import unittest -import socket - -from StringIO import StringIO - - -# Response -class MockResponse(): - """\ - Base mock response class - """ - code = 200 - msg = "OK" - - def __init__(self, cls): - self.cls = cls - - def content(self): - return "response" - - def response(self, req): - data = self.content(req) - url = req.get_full_url() - resp = urllib2.addinfourl(StringIO(data), data, url) - resp.code = self.code - resp.msg = self.msg - return resp - - -class MockHTTPHandler(urllib2.HTTPHandler, urllib2.HTTPSHandler): - """\ - Mocked HTTPHandler in order to query APIs locally - """ - cls = None - - def https_open(self, req): - return self.http_open(req) - - def http_open(self, req): - r = self.cls.callback(self.cls) - return r.response(req) - - @staticmethod - def patch(cls): - opener = urllib2.build_opener(MockHTTPHandler) - urllib2.install_opener(opener) - # dirty ! - for h in opener.handlers: - if isinstance(h, MockHTTPHandler): - h.cls = cls - return [h for h in opener.handlers if isinstance(h, MockHTTPHandler)][0] - - @staticmethod - def unpatch(): - # urllib2 - urllib2._opener = None - - -class BaseMockTests(unittest.TestCase): - """\ - Base Mock test case - """ - callback = MockResponse - - def setUp(self): - # patch DNS - self.original_getaddrinfo = socket.getaddrinfo - socket.getaddrinfo = self.new_getaddrinfo - MockHTTPHandler.patch(self) - - def tearDown(self): - MockHTTPHandler.unpatch() - # DNS - socket.getaddrinfo = self.original_getaddrinfo - - def new_getaddrinfo(self, *args): - return [(2, 1, 6, '', ('127.0.0.1', 0))] - - def _get_current_testname(self): - return self.id().split('.')[-1:][0] diff --git a/tests/data/extractors/authors/test_author_schema.html b/tests/data/extractors/authors/test_author_schema.html new file mode 100644 index 00000000..da7cfab4 --- /dev/null +++ b/tests/data/extractors/authors/test_author_schema.html @@ -0,0 +1,12 @@ + + + +
+

+ TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies. +

+
+ + diff --git a/tests/data/extractors/authors/test_author_schema.json b/tests/data/extractors/authors/test_author_schema.json new file mode 100644 index 00000000..32185d65 --- /dev/null +++ b/tests/data/extractors/authors/test_author_schema.json @@ -0,0 +1,11 @@ +{ + "url": "http://exemple.com/tweet/", + "expected": { + "authors": [ + "KEVIN SACK", + "ADAM NOSSITER", + "PAM BELLUCK", + "SHERI FINK" + ] + } +} diff --git a/tests/data/extractors/test_allnewlyrics1.html b/tests/data/extractors/content/test_allnewlyrics1.html similarity index 100% rename from tests/data/extractors/test_allnewlyrics1.html rename to tests/data/extractors/content/test_allnewlyrics1.html diff --git a/tests/data/extractors/test_allnewlyrics1.json b/tests/data/extractors/content/test_allnewlyrics1.json similarity index 90% rename from tests/data/extractors/test_allnewlyrics1.json rename to tests/data/extractors/content/test_allnewlyrics1.json index 4f8e8cc1..53cd1cf8 100644 --- a/tests/data/extractors/test_allnewlyrics1.json +++ b/tests/data/extractors/content/test_allnewlyrics1.json @@ -10,8 +10,8 @@ "PJ Morton", "Stevie Wonder" ], - "title": "PJ Morton (Ft. Stevie Wonder)", + "title": "\u201cOnly One\u201d Lyrics : PJ Morton (Ft. Stevie Wonder)", "meta_favicon": "", "meta_lang": "en" } -} \ No newline at end of file +} diff --git a/tests/data/extractors/test_aolNews.html b/tests/data/extractors/content/test_aolNews.html similarity index 100% rename from tests/data/extractors/test_aolNews.html rename to tests/data/extractors/content/test_aolNews.html diff --git a/tests/data/extractors/test_aolNews.json b/tests/data/extractors/content/test_aolNews.json similarity index 100% rename from tests/data/extractors/test_aolNews.json rename to tests/data/extractors/content/test_aolNews.json diff --git a/tests/data/extractors/content/test_articlebody_attribute.html b/tests/data/extractors/content/test_articlebody_attribute.html new file mode 100644 index 00000000..bbf00f65 --- /dev/null +++ b/tests/data/extractors/content/test_articlebody_attribute.html @@ -0,0 +1,15 @@ + + +
+

+ Not an Actual Content + TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies. +

+
+
+

+ Search-and-rescue teams were mobilized from across Southeast Asia on Sunday after a commercial airliner with 162 people on board lost contact with ground controllers off the coast of Borneo, a search effort that evoked a distressingly familiar mix of grief and mystery nine months after a Malaysia Airlines jetliner disappeared over the Indian Ocean. +

+
+ + diff --git a/tests/data/extractors/content/test_articlebody_attribute.json b/tests/data/extractors/content/test_articlebody_attribute.json new file mode 100644 index 00000000..7fbebcaf --- /dev/null +++ b/tests/data/extractors/content/test_articlebody_attribute.json @@ -0,0 +1,6 @@ +{ + "url": "http://exemple.com/test_opengraphcontent", + "expected": { + "cleaned_text": "Search-and-rescue teams were mobilized " + } +} diff --git a/tests/data/extractors/content/test_articlebody_itemprop.html b/tests/data/extractors/content/test_articlebody_itemprop.html new file mode 100644 index 00000000..46e5c9de --- /dev/null +++ b/tests/data/extractors/content/test_articlebody_itemprop.html @@ -0,0 +1,15 @@ + + +
+

+ Not an Actual Content + TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies. +

+
+
+

+ Search-and-rescue teams were mobilized from across Southeast Asia on Sunday after a commercial airliner with 162 people on board lost contact with ground controllers off the coast of Borneo, a search effort that evoked a distressingly familiar mix of grief and mystery nine months after a Malaysia Airlines jetliner disappeared over the Indian Ocean. +

+
+ + diff --git a/tests/data/extractors/content/test_articlebody_itemprop.json b/tests/data/extractors/content/test_articlebody_itemprop.json new file mode 100644 index 00000000..7fbebcaf --- /dev/null +++ b/tests/data/extractors/content/test_articlebody_itemprop.json @@ -0,0 +1,6 @@ +{ + "url": "http://exemple.com/test_opengraphcontent", + "expected": { + "cleaned_text": "Search-and-rescue teams were mobilized " + } +} diff --git a/tests/data/extractors/content/test_articlebody_tag.html b/tests/data/extractors/content/test_articlebody_tag.html new file mode 100644 index 00000000..6e7ca4be --- /dev/null +++ b/tests/data/extractors/content/test_articlebody_tag.html @@ -0,0 +1,15 @@ + + +
+

+ Not an Actual Content + TextNode 1 - The Scala supported IDE is one of the few pain points of developers who want to start using Scala in their Java project. On existing long term project developed by a team its hard to step in and introduce a new language that is not supported by the existing IDE. On way to go about it is to hid the fact that you use Scala from the Java world by using one way dependency injection. Still, if you wish to truly absorb Scala into your existing java environment then you'll soon introduced cross language dependencies. +

+
+
+

+ Search-and-rescue teams were mobilized from across Southeast Asia on Sunday after a commercial airliner with 162 people on board lost contact with ground controllers off the coast of Borneo, a search effort that evoked a distressingly familiar mix of grief and mystery nine months after a Malaysia Airlines jetliner disappeared over the Indian Ocean. +

+
+ + diff --git a/tests/data/extractors/content/test_articlebody_tag.json b/tests/data/extractors/content/test_articlebody_tag.json new file mode 100644 index 00000000..7fbebcaf --- /dev/null +++ b/tests/data/extractors/content/test_articlebody_tag.json @@ -0,0 +1,6 @@ +{ + "url": "http://exemple.com/test_opengraphcontent", + "expected": { + "cleaned_text": "Search-and-rescue teams were mobilized " + } +} diff --git a/tests/data/extractors/test_bbc_chinese.html b/tests/data/extractors/content/test_bbc_chinese.html similarity index 100% rename from tests/data/extractors/test_bbc_chinese.html rename to tests/data/extractors/content/test_bbc_chinese.html diff --git a/tests/data/extractors/test_bbc_chinese.json b/tests/data/extractors/content/test_bbc_chinese.json similarity index 100% rename from tests/data/extractors/test_bbc_chinese.json rename to tests/data/extractors/content/test_bbc_chinese.json diff --git a/tests/data/extractors/test_businessWeek1.html b/tests/data/extractors/content/test_businessWeek1.html similarity index 100% rename from tests/data/extractors/test_businessWeek1.html rename to tests/data/extractors/content/test_businessWeek1.html diff --git a/tests/data/extractors/test_businessWeek1.json b/tests/data/extractors/content/test_businessWeek1.json similarity index 100% rename from tests/data/extractors/test_businessWeek1.json rename to tests/data/extractors/content/test_businessWeek1.json diff --git a/tests/data/extractors/test_businessWeek2.html b/tests/data/extractors/content/test_businessWeek2.html similarity index 100% rename from tests/data/extractors/test_businessWeek2.html rename to tests/data/extractors/content/test_businessWeek2.html diff --git a/tests/data/extractors/test_businessWeek2.json b/tests/data/extractors/content/test_businessWeek2.json similarity index 100% rename from tests/data/extractors/test_businessWeek2.json rename to tests/data/extractors/content/test_businessWeek2.json diff --git a/tests/data/extractors/test_businessWeek3.html b/tests/data/extractors/content/test_businessWeek3.html similarity index 100% rename from tests/data/extractors/test_businessWeek3.html rename to tests/data/extractors/content/test_businessWeek3.html diff --git a/tests/data/extractors/test_businessWeek3.json b/tests/data/extractors/content/test_businessWeek3.json similarity index 100% rename from tests/data/extractors/test_businessWeek3.json rename to tests/data/extractors/content/test_businessWeek3.json diff --git a/tests/data/extractors/test_businessinsider3.html b/tests/data/extractors/content/test_businessinsider3.html similarity index 100% rename from tests/data/extractors/test_businessinsider3.html rename to tests/data/extractors/content/test_businessinsider3.html diff --git a/tests/data/extractors/test_businessinsider3.json b/tests/data/extractors/content/test_businessinsider3.json similarity index 100% rename from tests/data/extractors/test_businessinsider3.json rename to tests/data/extractors/content/test_businessinsider3.json diff --git a/tests/data/extractors/test_cbslocal.html b/tests/data/extractors/content/test_cbslocal.html similarity index 100% rename from tests/data/extractors/test_cbslocal.html rename to tests/data/extractors/content/test_cbslocal.html diff --git a/tests/data/extractors/test_cbslocal.json b/tests/data/extractors/content/test_cbslocal.json similarity index 100% rename from tests/data/extractors/test_cbslocal.json rename to tests/data/extractors/content/test_cbslocal.json diff --git a/tests/data/extractors/test_cnbc1.html 
b/tests/data/extractors/content/test_cnbc1.html similarity index 100% rename from tests/data/extractors/test_cnbc1.html rename to tests/data/extractors/content/test_cnbc1.html diff --git a/tests/data/extractors/test_cnbc1.json b/tests/data/extractors/content/test_cnbc1.json similarity index 100% rename from tests/data/extractors/test_cnbc1.json rename to tests/data/extractors/content/test_cnbc1.json diff --git a/tests/data/extractors/test_cnet.html b/tests/data/extractors/content/test_cnet.html similarity index 100% rename from tests/data/extractors/test_cnet.html rename to tests/data/extractors/content/test_cnet.html diff --git a/tests/data/extractors/test_cnet.json b/tests/data/extractors/content/test_cnet.json similarity index 100% rename from tests/data/extractors/test_cnet.json rename to tests/data/extractors/content/test_cnet.json diff --git a/tests/data/extractors/test_cnn1.html b/tests/data/extractors/content/test_cnn1.html similarity index 100% rename from tests/data/extractors/test_cnn1.html rename to tests/data/extractors/content/test_cnn1.html diff --git a/tests/data/extractors/test_cnn1.json b/tests/data/extractors/content/test_cnn1.json similarity index 96% rename from tests/data/extractors/test_cnn1.json rename to tests/data/extractors/content/test_cnn1.json index b847add0..ced9eb91 100644 --- a/tests/data/extractors/test_cnn1.json +++ b/tests/data/extractors/content/test_cnn1.json @@ -6,8 +6,8 @@ "final_url": "http://www.cnn.com/2010/POLITICS/08/13/democrats.social.security/index.html", "meta_keywords": "", "cleaned_text": "Washington (CNN) -- Democrats pledged ", - "title": "Democrats to use Social Security against GOP this fall", + "title": "Democrats to use Social Security against GOP this fall - CNN.com", "meta_favicon": "http://i.cdn.turner.com/cnn/.element/img/3.0/global/misc/apple-touch-icon.png", "meta_lang": "en" } -} \ No newline at end of file +} diff --git a/tests/data/extractors/test_cnn_arabic.html b/tests/data/extractors/content/test_cnn_arabic.html similarity index 100% rename from tests/data/extractors/test_cnn_arabic.html rename to tests/data/extractors/content/test_cnn_arabic.html diff --git a/tests/data/extractors/test_cnn_arabic.json b/tests/data/extractors/content/test_cnn_arabic.json similarity index 100% rename from tests/data/extractors/test_cnn_arabic.json rename to tests/data/extractors/content/test_cnn_arabic.json diff --git a/tests/data/extractors/test_donga_korean.html b/tests/data/extractors/content/test_donga_korean.html similarity index 100% rename from tests/data/extractors/test_donga_korean.html rename to tests/data/extractors/content/test_donga_korean.html diff --git a/tests/data/extractors/test_donga_korean.json b/tests/data/extractors/content/test_donga_korean.json similarity index 100% rename from tests/data/extractors/test_donga_korean.json rename to tests/data/extractors/content/test_donga_korean.json diff --git a/tests/data/extractors/test_elmondo1.html b/tests/data/extractors/content/test_elmondo1.html similarity index 100% rename from tests/data/extractors/test_elmondo1.html rename to tests/data/extractors/content/test_elmondo1.html diff --git a/tests/data/extractors/test_elmondo1.json b/tests/data/extractors/content/test_elmondo1.json similarity index 100% rename from tests/data/extractors/test_elmondo1.json rename to tests/data/extractors/content/test_elmondo1.json diff --git a/tests/data/extractors/test_elpais.html b/tests/data/extractors/content/test_elpais.html similarity index 100% rename from 
tests/data/extractors/test_elpais.html rename to tests/data/extractors/content/test_elpais.html diff --git a/tests/data/extractors/test_elpais.json b/tests/data/extractors/content/test_elpais.json similarity index 100% rename from tests/data/extractors/test_elpais.json rename to tests/data/extractors/content/test_elpais.json diff --git a/tests/data/extractors/test_engadget.html b/tests/data/extractors/content/test_engadget.html similarity index 100% rename from tests/data/extractors/test_engadget.html rename to tests/data/extractors/content/test_engadget.html diff --git a/tests/data/extractors/test_engadget.json b/tests/data/extractors/content/test_engadget.json similarity index 100% rename from tests/data/extractors/test_engadget.json rename to tests/data/extractors/content/test_engadget.json diff --git a/tests/data/extractors/test_espn.html b/tests/data/extractors/content/test_espn.html similarity index 100% rename from tests/data/extractors/test_espn.html rename to tests/data/extractors/content/test_espn.html diff --git a/tests/data/extractors/test_espn.json b/tests/data/extractors/content/test_espn.json similarity index 100% rename from tests/data/extractors/test_espn.json rename to tests/data/extractors/content/test_espn.json diff --git a/tests/data/extractors/test_foxNews.html b/tests/data/extractors/content/test_foxNews.html similarity index 100% rename from tests/data/extractors/test_foxNews.html rename to tests/data/extractors/content/test_foxNews.html diff --git a/tests/data/extractors/test_foxNews.json b/tests/data/extractors/content/test_foxNews.json similarity index 100% rename from tests/data/extractors/test_foxNews.json rename to tests/data/extractors/content/test_foxNews.json diff --git a/tests/data/extractors/test_get_canonical_url.html b/tests/data/extractors/content/test_get_canonical_url.html similarity index 100% rename from tests/data/extractors/test_get_canonical_url.html rename to tests/data/extractors/content/test_get_canonical_url.html diff --git a/tests/data/extractors/test_get_canonical_url.json b/tests/data/extractors/content/test_get_canonical_url.json similarity index 100% rename from tests/data/extractors/test_get_canonical_url.json rename to tests/data/extractors/content/test_get_canonical_url.json diff --git a/tests/data/extractors/test_gizmodo1.html b/tests/data/extractors/content/test_gizmodo1.html similarity index 100% rename from tests/data/extractors/test_gizmodo1.html rename to tests/data/extractors/content/test_gizmodo1.html diff --git a/tests/data/extractors/test_gizmodo1.json b/tests/data/extractors/content/test_gizmodo1.json similarity index 100% rename from tests/data/extractors/test_gizmodo1.json rename to tests/data/extractors/content/test_gizmodo1.json diff --git a/tests/data/extractors/test_guardian1.html b/tests/data/extractors/content/test_guardian1.html similarity index 100% rename from tests/data/extractors/test_guardian1.html rename to tests/data/extractors/content/test_guardian1.html diff --git a/tests/data/extractors/test_guardian1.json b/tests/data/extractors/content/test_guardian1.json similarity index 100% rename from tests/data/extractors/test_guardian1.json rename to tests/data/extractors/content/test_guardian1.json diff --git a/tests/data/extractors/test_huffingtonPost2.html b/tests/data/extractors/content/test_huffingtonPost2.html similarity index 100% rename from tests/data/extractors/test_huffingtonPost2.html rename to tests/data/extractors/content/test_huffingtonPost2.html diff --git 
a/tests/data/extractors/test_huffingtonPost2.json b/tests/data/extractors/content/test_huffingtonPost2.json similarity index 100% rename from tests/data/extractors/test_huffingtonPost2.json rename to tests/data/extractors/content/test_huffingtonPost2.json diff --git a/tests/data/extractors/content/test_issue115.html b/tests/data/extractors/content/test_issue115.html new file mode 100644 index 00000000..0b968cfc --- /dev/null +++ b/tests/data/extractors/content/test_issue115.html @@ -0,0 +1,1740 @@ + + + + + + + + + + + + + + Jessica Livingston: Why Startups Need to Focus on Sales, Not Marketing - The Accelerators - WSJ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + +
+
+
+ + + + +
+ + + +
+ +
+ + + + + +
+ + + +
+ + +
+ +
+
+
+ +
+ + + + +
+ +
+
+ + + + +
+ + + 1:26 pm ET
Jun 3, 2014
+

+ Sales/Marketing

+ +

+ Jessica Livingston: Why Startups Need to Focus on Sales, Not Marketing

+ +
+ + + +
+ +
+ + + + + + + + + + + + + + +
+
+ + + +
+
+
+
+
+
+ + + + + + + +
+
    +
    +
    +
    +
    +

    JESSICA LIVINGSTON: The most important thing an early-stage startup should know about marketing is rather counterintuitive: that you probably shouldn’t be doing anything you’d use the term “marketing” to describe. Sales and marketing are two ends of a continuum. At the sales end your outreach is narrow and deep. At the marketing end it is broad and shallow. And for an early stage startup, narrow and deep is what you want — not just in the way you appeal to users, but in the type of product you build. Which means the kind of marketing you should be doing should be indistinguishable from sales: you should be talking to a small number of users who are seriously interested in what you’re making, not a broad audience who are on the whole indifferent.

    +

    Successful startups almost always start narrow and deep. Apple started with a computer Steve Wozniak made to impress his friends at the Homebrew Computer Club. There weren’t a lot of them, but they were really interested. Facebook started out just for Harvard University students. Again, not a lot of potential users, but they really wanted it. Successful startups start narrow and deep partly because they don’t have the power to reach a big audience, so they have to choose a very interested one. But also because the product is still being defined. The conversation with initial users is also market research.

    + + + + + + +
    +
    +
    +
    +
    +
    +

    See what other startup mentors have to say about marketing tactics.

    +
    +

    At Y Combinator, we advise most startups to begin by seeking out some core group of early adopters and then engaging with individual users to convince them to sign up.

    +

For example, the early adopters of Airbnb were hosts and guests in New York City (Y Combinator funded Airbnb in the winter of 2009). To grow, Airbnb needed to get more hosts and also help existing hosts convert better. So Brian Chesky and Joe Gebbia flew to New York every week to meet with hosts — teaching them how to price their listings, take better photos, and so on. They also asked hosts for introductions to potential new hosts, whom they then met in person.

    +

Stripe (YC S09) was particularly aggressive about signing up users manually at first. The YC alumni network is a good source of early adopters for a service like Stripe. Co-founders Patrick and John Collison worked their way methodically through it, and when someone agreed to try Stripe, the brothers would install it for them on the spot rather than email a link. We now call their technique “Collison installation.”

    +

    Many guest speakers at Y Combinator offer stories about how manual the initial process of getting users was. Pinterest is a mass consumer product, but Ben Silbermann said even he began by recruiting users manually. Ben would literally walk into cafes in Palo Alto and ask random people to try out Pinterest while he gathered feedback over their shoulders.

    +

    The danger of the term “marketing” is that it implies the opposite end of the sales/marketing spectrum from the one startups should be focusing on. And just as focusing on the right end has a double benefit — you acquire users and define the product — focusing on the wrong end is doubly dangerous, because you not only fail to grow, but you can remain in denial about your product’s lameness.

    +

    All too often, I’ve seen founders build some initially mediocre product, announce it to the world, find that users never show up, and not know what to do next. As well as not getting any users, the startup never gets the feedback it needs to improve the product.

    +

    So why wouldn’t all founders start by engaging with users individually? Because it’s hard and demoralizing. Sales gives you a kind of harsh feedback that “marketing” doesn’t. You try to convince someone to use what you’ve built, and they won’t. These conversations are painful, but necessary. I suspect from my experience that founders who want to remain in denial about the inadequacy of their product and/or the difficulty of starting a startup subconsciously prefer the broad and shallow “marketing” approach precisely because they can’t face the work and unpleasant truths they’ll find if they talk to users.

    +

    How should you measure if your manual efforts are effective? Focus on growth rate rather than absolute numbers. Then you won’t be dismayed if the absolute numbers are small at first. If you have 20 users, you only need two more this week to grow 10%. And while two users is a small number for most products, 10% a week is a great growth rate. If you keep growing at 10% a week, the absolute numbers will eventually become impressive.
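To see how quickly 10% a week compounds, here is a quick sanity check (a minimal sketch in Python; the 20-user base and the one-year horizon are illustrative assumptions, not figures from the post)::

    >>> users = 20.0
    >>> for week in range(52):
    ...     users *= 1.10  # grow 10% each week
    >>> round(users)       # 20 * 1.1 ** 52
    2841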

    +

    Our advice at Y Combinator is always to make a really good product and go out and get users manually. The two work hand-in-hand: you need to talk individually to early adopters to make a really good product. So focusing on the narrow and deep end of the sales/marketing continuum is not just the most effective way to get users. Your startup will die if you don’t.

    +


    +

    +


    +

    + +
    + + + + +
    + + + + + + + + + + + + +
    The Accelerators HOME PAGE
    + +

    Add a Comment

    We welcome thoughtful comments from readers. Please comply with our guidelines. Our blogs do not require the use of your real name.

    Comments (5 of 23)

    View all Comments »
      • I feel that Levinson (Guerrilla Marketing) defines marketing in the most simple and true way -- anything you do to help sell a product or service. These two terms are connected at the hip...sales do not take place without effective marketing no matter what you choose to use to communicate messages. A face-to-face with early adopters, reaching out through influencers (thank you Malcolm Gladwell), or effective networking within a narrow group can all be considered marketing. Just as much as direct mail, television commercials or an Adwords campaign. You can't just go out and sell without making sure your message is clear (and differentiates you). And you can't go out with a clear message without determining what communication method will work most effectively with your target audience. In the end, we are all marketers and salespeople, and where we are on the marketing - sales continuum is determined by the moment at which you want to ask for someone to buy you/your product. Even though methodology might evolve and change what doesn't change is the need to make sure your message breaks through the marketing clutter so that the intended target says yes when you ask them to buy what you're marketing to them.

        +
• This article and the associated comments pretty much sum up the primary problem with "marketing": people have varying definitions. Product management concepts (4P's), outbound campaign concepts (for brand, benefits, offers, promotions, positioning, etc.), inbound (market research, iterative development, etc.), and marketing communications (internal and external, PR). All of these discrete concepts are valid and essential in running a business, and have different focus/priority depending on the stage of the company.

        +

That said, having been involved in successful and unsuccessful startups, I find the point of the article valid: good analysis of product adoption by early users is essential to the success of a product and more important than activities that drive broad, mass awareness. This feedback loop is an extension of the iterative feedback loop found in most agile development activities; the users are just providing another feedback point, namely the willingness to spend their dollars.

        +

        While I've encountered entrepreneurs that hesitate to engage early users, these are typically the same folks that don't respond well when you tell them certain aspects of the product need to be changed during the development process. It's just their belief that they know best. Unlike Steve Jobs et al, they haven't done their time with users.

        +

        One proposal for future "marketing" articles is to clearly identify the aspect of "marketing" to be discussed. Much like Operations and Product Development, Marketing encompasses many disciplines. Unfortunately, a few of the disciplines call themselves Marketing in various organizations.

        +

        Cheers.

        +
      • It appears times have changed and not for the better for the small business owners and start-up entrepreneurs. Sales cannot be generated without marketing and the small owner, not the larger business owner has no consumption of free offers to enhance their business. I had a teaching school for 31 years with business education and trained some 250,000 business owners at a 90% success rating. Coming back from a 10 years retiring stay I have tried to re-start the same program with additional educational program and new concepts, It's been some 10 months and not a nibble on one of the best educational program ever offered into the small business world. The marketing has not gathered one cent in sales and after offering more than $500.00 per person in free services, and materials, and much more. More than 6000 contacts 150 programs and within those contacts more than 2000 personal face to face presentations, and reviewed materials. It seems I cannot reach the small business owner to make them a better owner and a more profitable and saleable business. I have created new programs, insurance guarantees, seminars guarantees, free promotional advertising, on air free promotions, and so many other business programs.

        +

        I don't know to agree, or disagree but the fact remains that the marketing is not the same as it was going back to when, in 1985 when I started. When you offer sound and sounder business program for free and no one asked for the program I am at the end of my rope and some 50 years of experience and $35 million in sales in 1989, WHO CAN OFFER THE CORRECT ANSWER TO REACH THE SMALL BUSINESS OWNER.. Thanks Tony Pezza

        +
      • It doesn't appear you understand or appreciate the definition of marketing. There are at least four essential components of marketing (some argue there are as many as six), each with considerable depth and no more important than the other components: The product itself, the place/distribution channel through which you will sell the product, the price of the product and the way in which you'll position/promote the product to drive sales.

        +

        Andrew Shea
        +Senior Marketing Executive
        +St. Louis, Missouri

        +
      • I'm not a marketing/sales person, so I won't go there. But, having founded my consulting firm three years ago, one of the most important lessons I've learned is investors and lenders are most impressed by revenue. And, one generates revenue by making sales. If your company can demonstrate that it can create yield on a small budget investors and lenders will be (1) taken with management (a critical threshold) and (2) more inclined to invest or lend funds.

        +
    + + + +
    + + + + + + +
    +
    +
    + + + +
    +
    + +
    +
    +

    About The Accelerators

    + + + +
    +
      +
    • +

For aspiring or actual entrepreneurs, The Accelerators forum is a lively discussion among startup mentors: entrepreneurs, angel investors and venture capitalists. To reach us: @wsjstartup or theaccelerators@wsj.com.

      +
        +
      • +
        The Accelerators on Facebook +
      • +
      +
    • +
    +
    +
    +

    The Accelerators

    +
    + + + + + + + + + +
    + + + + + + + + + +
    + + + + + +
    +
    + + Liz
    Lange
    +
    +
    + + + +
    + + + + + + + + + +
    + + + + + + + +
    +
    + + Mike
    Walsh
    +
    +
    + +
    + + + + + + + + + +
    + +
    +
    + + +
    + + + +
    + + + + + + + +
    + + +
    + + + +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/data/extractors/content/test_issue115.json b/tests/data/extractors/content/test_issue115.json new file mode 100644 index 00000000..8f87a9bc --- /dev/null +++ b/tests/data/extractors/content/test_issue115.json @@ -0,0 +1,6 @@ +{ + "url": "http://blogs.wsj.com/accelerators/2014/06/03/jessica-livingston-why-startups-need-to-focus-on-sales-not-marketing/", + "expected": { + "cleaned_text": "JESSICA LIVINGSTON: The most important thing an early-stage startup should know about marketing is rather counterintuitive: that you probably shouldn’t be doing anything you’d use the term" + } +} diff --git a/tests/data/extractors/content/test_issue129.html b/tests/data/extractors/content/test_issue129.html new file mode 100644 index 00000000..9f523cbc --- /dev/null +++ b/tests/data/extractors/content/test_issue129.html @@ -0,0 +1,1460 @@ + + + + + + + + + + + + + + + + + +Lost in JIT: PyPy and the road towards SciPy + + + + + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    + + +
    + +

    Thursday, October 27, 2011

    + +
    + +
    +
    + + + +

    +PyPy and the road towards SciPy +

    +
    +
    +
    +
    +

    Hello


    Recent PyPys effort to bring NumPy and the associated fundraiser
    caused a lot of discussion in the SciPy community regarding PyPy, NumPy,
    SciPy and the future of numeric computing in Python.


There were discussions on the topic as well as various blog posts
from the SciPy community that addressed a few issues. It seems there was a lot
of talking past each other, and I would like to clarify a few points here,
although this should be taken as my personal opinion on the subject.


So, let's start from the beginning. There are no plans for PyPy to
reimplement everything that's out there in RPython. That has been pointed
out from the beginning as a fallacy of our approach -- we simply don't plan
to do that. We agree that Python is a great glue language and we would like
to keep it that way. PyPy can nicely interface with C using ctypes, with
a slightly worse story for C++ (even though there were experiments).
What we know by now is that the CPython C API is not a very good glue for
PyPy: it's too tied to CPython, and it prevents a lot of interesting
optimizations from happening. There are a few contenders, with Cython being
the favorite for now; however, for Cython to be usable we need to have a
story for C++ (I know Cython does have a story, but it's unclear how that
would work with the PyPy backend).
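For reference, the ctypes route mentioned above looks roughly like this (a
minimal sketch against the standard C math library; the "libm.so.6" fallback
is an assumption for Linux systems where find_library returns nothing)::

    >>> import ctypes, ctypes.util
    >>> libm = ctypes.CDLL(ctypes.util.find_library("m") or "libm.so.6")
    >>> libm.cos.argtypes = [ctypes.c_double]
    >>> libm.cos.restype = ctypes.c_double
    >>> libm.cos(0.0)  # calls straight into C, no CPython C API involved
    1.0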


Which brings me to the second point: while a lot of code in packages like
SciPy or matplotlib should be reusable in PyPy, it's probably not reusable
in its current form. Either a lot of it has to move to Cython, or some other
way of interfacing with C will come along. This should make it clear that
we want to interface with SciPy and reuse as much as possible.


Another recurring topic is why we don't just reuse Cython
for NumPy instead of reimplementing everything. The problem is that we need
a robust array type with the full interface before we can start using Cython
for anything. Since we're going to implement it anyway, why not go all the way
and implement the full NumPy module? That is exactly the topic of the current
funding proposal -- to provide a full NumPy module. That
would be a very good start for integrating the full stack of SciPy and
matplotlib and all the other libraries out there.


But the trick is also that a robust array module can go a long way on its
own. It allows you to prototype a lot of algorithms and generally has
its uses, without having to worry that reading all the elements from the
array is going to be dog slow.
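The worry about per-element reads looks like this kind of loop (an
illustrative sketch; with a JIT-friendly array type such a loop can be
compiled to tight machine code instead of running at interpreter speed)::

    >>> def total(arr):
    ...     acc = 0.0
    ...     for i in range(len(arr)):
    ...         acc += arr[i]  # per-element access is the hot path here
    ...     return acc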


The last accusation is that we're trying to split the community. The answer is
simply no. We have a relatively good roadmap for supporting what's out
there in the scientific community, and ideally for supporting everyone. This
will, however, take some time, and the group of people who can run their
stuff on top of PyPy will keep growing. This is precisely what
is happening in other areas of the Python world -- more and more stuff runs
on PyPy, and people find it more and more interesting to try it and to adapt
their own code to run on it.


To summarize, I don't really think there is that much of a gap between us
and the SciPy people. We'll start small (by providing a full NumPy
implementation) and then gradually move forward, reusing as much as possible
from the entire stack.


    Cheers,
    fijal

    +
    +
    + +
    +
    + +

    7 comments:

    +
    + + +
    +
1. I'm going to play devil's advocate and ask the question of why PyPy should care one bit about the existing Numpy implementation or supporting C++ right now. I think it would be cool if the PyPy folks simply built the array type that *they* want. Make it fast. Do every kind of crazy optimization you can think of with it. Sure, call it something other than numpy to start, but make it something that programmers who want to live on the bleeding edge can play around with and try out (I know I'd be interested in messing around with something like that). Providing full numpy compatibility and all of that can come later on after more experience has been gained.

      ReplyDelete
    2. Hi Dave.

If you download a PyPy nightly, you can play with a numpy.array that does exactly this. We're working on adding features (like multi-dimensional arrays), and the numpy API is simply kind of good.

      ReplyDelete
    3. The numpy interface is battle-tested over many years of use, and is pretty flexible. I am usually pleasantly surprised when applying it to new problems.

      Given the effort required to integrate a multidimensional array type into PyPy, I don't think it makes sense to try to reinvent the wheel by designing a completely new API. I could see someone experimenting with the API after a numpy-derived core is in place.

      ReplyDelete
    4. You can write "full" in bold, but that doesn't make it so. It should be clear to you by now that by claiming to provide a full numpy implementation you are at the very least confusing the issue for many users. To spell it out once more, here is what numpy provides and what you plan to implement:

      - Python API; ndarray, dtypes (yes)
      - C API; ndarray, dtypes (no)
      - Fourier transforms (no - I think)
      - Linear algebra (no - I think)
      - Random number generators (yes - I think)

      Furthermore, several people (Travis, David, Ian, Dave Beazley above) mentioned you shouldn't call your implementation numpy. Before you were using micronumpy, that makes a lot more sense.

      ReplyDelete
    5. When I mean full, I mean full. It's all yes in your table except the C API. The way to call C using those arrays will be provided, but not using the CPython C API.

We'll rename it to numpypy for the time being (at least until it's reasonably complete).

      ReplyDelete
    6. I'm not quite sure why people are getting so fussed about it. Most of the work in SciPy is in the C code, and it will still be easy to point some algorithm written in C at the memory held by the new PyPy arrays as it is in the current numpy.

Why would people use PyPy for science if its implementation of numpy were slower than CPython's? They wouldn't, and that's why PyPy can't expose the existing CPython C API: simply exposing that API would make it much slower, due to the overhead of simulating ref-counting, etc. There's no point in PyPy trying to make a numpy implementation that exposes the CPython C API.

      ReplyDelete
    7. I think that linear algebra and Fourier transforms are frequently needed.
Come on guys, let's donate:
      http://pypy.org/numpydonate.html

      ReplyDelete
    +
    +
    +
    + +

    +

    + + + + +
    +

    + +
    +
    + +
    + + +
    +
    + +Newer Post + + +Older Post + +Home +
    +
    +
    + +
    + +
    +
    +
    +
    +
    + +
    +
    +
    +
    + +
    +
    +
    +
    + +
    + +
    +
    +
    +
    +
    +
    +
    +
    + +
    + +
    +
    +
    +
    +
    +
    +
    +
    + + + + + + diff --git a/tests/data/extractors/content/test_issue129.json b/tests/data/extractors/content/test_issue129.json new file mode 100644 index 00000000..ddf6cbc8 --- /dev/null +++ b/tests/data/extractors/content/test_issue129.json @@ -0,0 +1,6 @@ +{ + "url": "http://lostinjit.blogspot.fr/2011/10/pypy-and-road-towards-scipy.html", + "expected": { + "cleaned_text": "Recent PyPys effort to bring NumPy and the associated fundraiser" + } +} diff --git a/tests/data/extractors/test_issue24.html b/tests/data/extractors/content/test_issue24.html similarity index 100% rename from tests/data/extractors/test_issue24.html rename to tests/data/extractors/content/test_issue24.html diff --git a/tests/data/extractors/test_issue24.json b/tests/data/extractors/content/test_issue24.json similarity index 100% rename from tests/data/extractors/test_issue24.json rename to tests/data/extractors/content/test_issue24.json diff --git a/tests/data/extractors/test_issue25.html b/tests/data/extractors/content/test_issue25.html similarity index 100% rename from tests/data/extractors/test_issue25.html rename to tests/data/extractors/content/test_issue25.html diff --git a/tests/data/extractors/test_issue25.json b/tests/data/extractors/content/test_issue25.json similarity index 100% rename from tests/data/extractors/test_issue25.json rename to tests/data/extractors/content/test_issue25.json diff --git a/tests/data/extractors/test_issue28.html b/tests/data/extractors/content/test_issue28.html similarity index 100% rename from tests/data/extractors/test_issue28.html rename to tests/data/extractors/content/test_issue28.html diff --git a/tests/data/extractors/test_issue28.json b/tests/data/extractors/content/test_issue28.json similarity index 100% rename from tests/data/extractors/test_issue28.json rename to tests/data/extractors/content/test_issue28.json diff --git a/tests/data/extractors/test_issue32.html b/tests/data/extractors/content/test_issue32.html similarity index 100% rename from tests/data/extractors/test_issue32.html rename to tests/data/extractors/content/test_issue32.html diff --git a/tests/data/extractors/test_issue32.json b/tests/data/extractors/content/test_issue32.json similarity index 100% rename from tests/data/extractors/test_issue32.json rename to tests/data/extractors/content/test_issue32.json diff --git a/tests/data/extractors/test_issue4.html b/tests/data/extractors/content/test_issue4.html similarity index 100% rename from tests/data/extractors/test_issue4.html rename to tests/data/extractors/content/test_issue4.html diff --git a/tests/data/extractors/test_issue4.json b/tests/data/extractors/content/test_issue4.json similarity index 100% rename from tests/data/extractors/test_issue4.json rename to tests/data/extractors/content/test_issue4.json diff --git a/tests/data/extractors/test_lefigaro.html b/tests/data/extractors/content/test_lefigaro.html similarity index 100% rename from tests/data/extractors/test_lefigaro.html rename to tests/data/extractors/content/test_lefigaro.html diff --git a/tests/data/extractors/test_lefigaro.json b/tests/data/extractors/content/test_lefigaro.json similarity index 91% rename from tests/data/extractors/test_lefigaro.json rename to tests/data/extractors/content/test_lefigaro.json index 311f4455..19f655ec 100644 --- a/tests/data/extractors/test_lefigaro.json +++ b/tests/data/extractors/content/test_lefigaro.json @@ -6,7 +6,7 @@ "domain": "www.lefigaro.fr", "final_url": 
"http://www.lefigaro.fr/conjoncture/2013/04/05/20002-20130405ARTFIG00473-montebourg-envisage-des-privatisations-partielles.php", "meta_keywords": "Actualit\u00e9 \u00e9conomique, entreprises, \u00e9conomie, bourse, emploi, imp\u00f4ts, cac 40, creation d'entreprise, chef d'entreprise, grands patrons, consommation, multinationales, privatisation, d\u00e9localisations, concurrence, monopole, crise, bourse, licenciements, union europ\u00e9enne, etats-unis, chine, pmi, pme, tpe, salaires, relance, pib, pnb, aides sociales, japon, r\u00e9cession, \u00e9conomie verte, fmi, reprise, croissance, news, actu", - "cleaned_text": "Selon le ministre du Redressement productif interview\u00e9 par le Wall Street Journal, le gouvernement", + "cleaned_text": "«Dans le cadre de l'effort de restructuration budgétaire", "tags": [ "EDF", "Privatisation", @@ -19,4 +19,4 @@ "meta_favicon": "http://www.lefigaro.fr/icones/favicon.ico", "meta_lang": null } -} \ No newline at end of file +} diff --git a/tests/data/extractors/test_liberation.html b/tests/data/extractors/content/test_liberation.html similarity index 100% rename from tests/data/extractors/test_liberation.html rename to tests/data/extractors/content/test_liberation.html diff --git a/tests/data/extractors/test_liberation.json b/tests/data/extractors/content/test_liberation.json similarity index 100% rename from tests/data/extractors/test_liberation.json rename to tests/data/extractors/content/test_liberation.json diff --git a/tests/data/extractors/test_marketplace.html b/tests/data/extractors/content/test_marketplace.html similarity index 100% rename from tests/data/extractors/test_marketplace.html rename to tests/data/extractors/content/test_marketplace.html diff --git a/tests/data/extractors/test_marketplace.json b/tests/data/extractors/content/test_marketplace.json similarity index 100% rename from tests/data/extractors/test_marketplace.json rename to tests/data/extractors/content/test_marketplace.json diff --git a/tests/data/extractors/test_mashable_issue_74.html b/tests/data/extractors/content/test_mashable_issue_74.html similarity index 100% rename from tests/data/extractors/test_mashable_issue_74.html rename to tests/data/extractors/content/test_mashable_issue_74.html diff --git a/tests/data/extractors/test_mashable_issue_74.json b/tests/data/extractors/content/test_mashable_issue_74.json similarity index 100% rename from tests/data/extractors/test_mashable_issue_74.json rename to tests/data/extractors/content/test_mashable_issue_74.json diff --git a/tests/data/extractors/test_msn1.html b/tests/data/extractors/content/test_msn1.html similarity index 100% rename from tests/data/extractors/test_msn1.html rename to tests/data/extractors/content/test_msn1.html diff --git a/tests/data/extractors/test_msn1.json b/tests/data/extractors/content/test_msn1.json similarity index 100% rename from tests/data/extractors/test_msn1.json rename to tests/data/extractors/content/test_msn1.json diff --git a/tests/data/extractors/test_okaymarketing.html b/tests/data/extractors/content/test_okaymarketing.html similarity index 100% rename from tests/data/extractors/test_okaymarketing.html rename to tests/data/extractors/content/test_okaymarketing.html diff --git a/tests/data/extractors/test_okaymarketing.json b/tests/data/extractors/content/test_okaymarketing.json similarity index 100% rename from tests/data/extractors/test_okaymarketing.json rename to tests/data/extractors/content/test_okaymarketing.json diff --git a/tests/data/extractors/test_politico.html 
b/tests/data/extractors/content/test_politico.html similarity index 100% rename from tests/data/extractors/test_politico.html rename to tests/data/extractors/content/test_politico.html diff --git a/tests/data/extractors/test_politico.json b/tests/data/extractors/content/test_politico.json similarity index 100% rename from tests/data/extractors/test_politico.json rename to tests/data/extractors/content/test_politico.json diff --git a/tests/data/extractors/test_techcrunch1.html b/tests/data/extractors/content/test_techcrunch1.html similarity index 100% rename from tests/data/extractors/test_techcrunch1.html rename to tests/data/extractors/content/test_techcrunch1.html diff --git a/tests/data/extractors/test_techcrunch1.json b/tests/data/extractors/content/test_techcrunch1.json similarity index 100% rename from tests/data/extractors/test_techcrunch1.json rename to tests/data/extractors/content/test_techcrunch1.json diff --git a/tests/data/extractors/test_testHuffingtonPost.html b/tests/data/extractors/content/test_testHuffingtonPost.html similarity index 100% rename from tests/data/extractors/test_testHuffingtonPost.html rename to tests/data/extractors/content/test_testHuffingtonPost.html diff --git a/tests/data/extractors/test_testHuffingtonPost.json b/tests/data/extractors/content/test_testHuffingtonPost.json similarity index 100% rename from tests/data/extractors/test_testHuffingtonPost.json rename to tests/data/extractors/content/test_testHuffingtonPost.json diff --git a/tests/data/extractors/test_time.html b/tests/data/extractors/content/test_time.html similarity index 100% rename from tests/data/extractors/test_time.html rename to tests/data/extractors/content/test_time.html diff --git a/tests/data/extractors/test_time.json b/tests/data/extractors/content/test_time.json similarity index 90% rename from tests/data/extractors/test_time.json rename to tests/data/extractors/content/test_time.json index 31341c9c..05cb400c 100644 --- a/tests/data/extractors/test_time.json +++ b/tests/data/extractors/content/test_time.json @@ -6,8 +6,8 @@ "final_url": "http://www.time.com/time/health/article/0,8599,2011497,00.html", "meta_keywords": "bp, oil, spill, gulf, mexico, invisible, dispersed, deepwater horizon, Charles Hopkinson", "cleaned_text": "This month, the federal government released", - "title": "Invisible Oil from BP Spill May Threaten Gulf Aquatic Life", + "title": "Oil from Spill Could Still Pose Major Threat", "meta_favicon": "http://img.timeinc.net/time/favicon.ico", "meta_lang": null } -} \ No newline at end of file +} diff --git a/tests/data/extractors/test_time2.html b/tests/data/extractors/content/test_time2.html similarity index 100% rename from tests/data/extractors/test_time2.html rename to tests/data/extractors/content/test_time2.html diff --git a/tests/data/extractors/test_time2.json b/tests/data/extractors/content/test_time2.json similarity index 100% rename from tests/data/extractors/test_time2.json rename to tests/data/extractors/content/test_time2.json diff --git a/tests/data/extractors/test_usatoday_issue_74.html b/tests/data/extractors/content/test_usatoday_issue_74.html similarity index 100% rename from tests/data/extractors/test_usatoday_issue_74.html rename to tests/data/extractors/content/test_usatoday_issue_74.html diff --git a/tests/data/extractors/test_usatoday_issue_74.json b/tests/data/extractors/content/test_usatoday_issue_74.json similarity index 100% rename from tests/data/extractors/test_usatoday_issue_74.json rename to 
tests/data/extractors/content/test_usatoday_issue_74.json diff --git a/tests/data/extractors/test_yahoo.html b/tests/data/extractors/content/test_yahoo.html similarity index 100% rename from tests/data/extractors/test_yahoo.html rename to tests/data/extractors/content/test_yahoo.html diff --git a/tests/data/extractors/test_yahoo.json b/tests/data/extractors/content/test_yahoo.json similarity index 100% rename from tests/data/extractors/test_yahoo.json rename to tests/data/extractors/content/test_yahoo.json diff --git a/tests/data/images/test_basic_image/50850547cc7310bc53e30e802c6318f1 b/tests/data/extractors/images/test_basic_image/50850547cc7310bc53e30e802c6318f1 similarity index 100% rename from tests/data/images/test_basic_image/50850547cc7310bc53e30e802c6318f1 rename to tests/data/extractors/images/test_basic_image/50850547cc7310bc53e30e802c6318f1 diff --git a/tests/data/images/test_basic_image/test_basic_image.html b/tests/data/extractors/images/test_basic_image/test_basic_image.html similarity index 100% rename from tests/data/images/test_basic_image/test_basic_image.html rename to tests/data/extractors/images/test_basic_image/test_basic_image.html diff --git a/tests/data/images/test_basic_image/test_basic_image.json b/tests/data/extractors/images/test_basic_image/test_basic_image.json similarity index 100% rename from tests/data/images/test_basic_image/test_basic_image.json rename to tests/data/extractors/images/test_basic_image/test_basic_image.json diff --git a/tests/data/images/test_known_image_css_class/test_known_image_css_class.html b/tests/data/extractors/images/test_known_image_css_class/test_known_image_css_class.html similarity index 100% rename from tests/data/images/test_known_image_css_class/test_known_image_css_class.html rename to tests/data/extractors/images/test_known_image_css_class/test_known_image_css_class.html diff --git a/tests/data/images/test_known_image_css_class/test_known_image_css_class.json b/tests/data/extractors/images/test_known_image_css_class/test_known_image_css_class.json similarity index 100% rename from tests/data/images/test_known_image_css_class/test_known_image_css_class.json rename to tests/data/extractors/images/test_known_image_css_class/test_known_image_css_class.json diff --git a/tests/data/images/test_known_image_css_id/test_known_image_css_id.html b/tests/data/extractors/images/test_known_image_css_id/test_known_image_css_id.html similarity index 100% rename from tests/data/images/test_known_image_css_id/test_known_image_css_id.html rename to tests/data/extractors/images/test_known_image_css_id/test_known_image_css_id.html diff --git a/tests/data/images/test_known_image_css_id/test_known_image_css_id.json b/tests/data/extractors/images/test_known_image_css_id/test_known_image_css_id.json similarity index 100% rename from tests/data/images/test_known_image_css_id/test_known_image_css_id.json rename to tests/data/extractors/images/test_known_image_css_id/test_known_image_css_id.json diff --git a/tests/data/images/test_known_image_css_parent_class/test_known_image_css_parent_class.html b/tests/data/extractors/images/test_known_image_css_parent_class/test_known_image_css_parent_class.html similarity index 100% rename from tests/data/images/test_known_image_css_parent_class/test_known_image_css_parent_class.html rename to tests/data/extractors/images/test_known_image_css_parent_class/test_known_image_css_parent_class.html diff --git a/tests/data/images/test_known_image_css_parent_class/test_known_image_css_parent_class.json 
b/tests/data/extractors/images/test_known_image_css_parent_class/test_known_image_css_parent_class.json similarity index 100% rename from tests/data/images/test_known_image_css_parent_class/test_known_image_css_parent_class.json rename to tests/data/extractors/images/test_known_image_css_parent_class/test_known_image_css_parent_class.json diff --git a/tests/data/images/test_known_image_css_parent_id/test_known_image_css_parent_id.html b/tests/data/extractors/images/test_known_image_css_parent_id/test_known_image_css_parent_id.html similarity index 100% rename from tests/data/images/test_known_image_css_parent_id/test_known_image_css_parent_id.html rename to tests/data/extractors/images/test_known_image_css_parent_id/test_known_image_css_parent_id.html diff --git a/tests/data/images/test_known_image_css_parent_id/test_known_image_css_parent_id.json b/tests/data/extractors/images/test_known_image_css_parent_id/test_known_image_css_parent_id.json similarity index 100% rename from tests/data/images/test_known_image_css_parent_id/test_known_image_css_parent_id.json rename to tests/data/extractors/images/test_known_image_css_parent_id/test_known_image_css_parent_id.json diff --git a/tests/data/images/test_known_image_empty_src/test_known_image_empty_src.html b/tests/data/extractors/images/test_known_image_empty_src/test_known_image_empty_src.html similarity index 100% rename from tests/data/images/test_known_image_empty_src/test_known_image_empty_src.html rename to tests/data/extractors/images/test_known_image_empty_src/test_known_image_empty_src.html diff --git a/tests/data/images/test_known_image_empty_src/test_known_image_empty_src.json b/tests/data/extractors/images/test_known_image_empty_src/test_known_image_empty_src.json similarity index 100% rename from tests/data/images/test_known_image_empty_src/test_known_image_empty_src.json rename to tests/data/extractors/images/test_known_image_empty_src/test_known_image_empty_src.json diff --git a/tests/data/images/test_known_image_name_parent/test_known_image_name_parent.html b/tests/data/extractors/images/test_known_image_name_parent/test_known_image_name_parent.html similarity index 100% rename from tests/data/images/test_known_image_name_parent/test_known_image_name_parent.html rename to tests/data/extractors/images/test_known_image_name_parent/test_known_image_name_parent.html diff --git a/tests/data/images/test_known_image_name_parent/test_known_image_name_parent.json b/tests/data/extractors/images/test_known_image_name_parent/test_known_image_name_parent.json similarity index 100% rename from tests/data/images/test_known_image_name_parent/test_known_image_name_parent.json rename to tests/data/extractors/images/test_known_image_name_parent/test_known_image_name_parent.json diff --git a/tests/data/images/test_opengraph_tag/test_opengraph_tag.html b/tests/data/extractors/images/test_opengraph_tag/test_opengraph_tag.html similarity index 100% rename from tests/data/images/test_opengraph_tag/test_opengraph_tag.html rename to tests/data/extractors/images/test_opengraph_tag/test_opengraph_tag.html diff --git a/tests/data/images/test_opengraph_tag/test_opengraph_tag.json b/tests/data/extractors/images/test_opengraph_tag/test_opengraph_tag.json similarity index 100% rename from tests/data/images/test_opengraph_tag/test_opengraph_tag.json rename to tests/data/extractors/images/test_opengraph_tag/test_opengraph_tag.json diff --git a/tests/data/extractors/links/test_links.html b/tests/data/extractors/links/test_links.html new file mode 100644 index 
00000000..c097d4ee --- /dev/null +++ b/tests/data/extractors/links/test_links.html @@ -0,0 +1,16 @@ + + +
    +

+ TextNode 1 - Scala support in the IDE is one of the few pain points for developers who want to start using Scala in their Java project. On an existing long-term project developed by a team, it's hard to step in and introduce a new language that is not supported by the existing IDE. One way to go about it is to hide the fact that you use Scala from the Java world by using one-way dependency injection. + links + Still, if you wish to truly absorb Scala into your existing Java environment, then you'll soon introduce cross-language dependencies. +

    +
    +
    +

+ TextNode 1 - Scala support in the IDE is one of the few pain points for developers who want to start using Scala in their Java project. On an existing long-term project developed by a team, it's hard to step in and introduce a new language that is not supported by the existing IDE. One way to go about it is to hide the fact that you use Scala from the Java world by using one-way dependency injection. Still, if you wish to truly absorb Scala into your existing Java environment, then you'll soon introduce cross-language dependencies. +

    +
    + + diff --git a/tests/data/extractors/links/test_links.json b/tests/data/extractors/links/test_links.json new file mode 100644 index 00000000..74f1c682 --- /dev/null +++ b/tests/data/extractors/links/test_links.json @@ -0,0 +1,6 @@ +{ + "url": "http://exemple.com/links/", + "expected": { + "links": 2 + } +} diff --git a/tests/data/extractors/opengraph/test_opengraph.html b/tests/data/extractors/opengraph/test_opengraph.html new file mode 100644 index 00000000..bcc8cbb8 --- /dev/null +++ b/tests/data/extractors/opengraph/test_opengraph.html @@ -0,0 +1,16 @@ + + + + + + + + + +
    +

+ TextNode 1 - Scala support in the IDE is one of the few pain points for developers who want to start using Scala in their Java project. On an existing long-term project developed by a team, it's hard to step in and introduce a new language that is not supported by the existing IDE. One way to go about it is to hide the fact that you use Scala from the Java world by using one-way dependency injection. Still, if you wish to truly absorb Scala into your existing Java environment, then you'll soon introduce cross-language dependencies. +

    +
    + + diff --git a/tests/data/extractors/opengraph/test_opengraph.json b/tests/data/extractors/opengraph/test_opengraph.json new file mode 100644 index 00000000..ba05d768 --- /dev/null +++ b/tests/data/extractors/opengraph/test_opengraph.json @@ -0,0 +1,12 @@ +{ + "url": "http://exemple.com/test_opengraphcontent", + "expected": { + "opengraph": { + "url": "http://www.somenews.com/2012/09/19/nyregion/some-news-article.html?pagewanted=all", + "image": "http://graphics8.somenews.com/images/2012/09/19/region/some-news-image.jpg", + "type": "article", + "description": "Some News Happened in New York", + "title": "Some News Article Story" + } + } +} diff --git a/tests/data/extractors/publishdate/test_publish_date.html b/tests/data/extractors/publishdate/test_publish_date.html new file mode 100644 index 00000000..6ce2b927 --- /dev/null +++ b/tests/data/extractors/publishdate/test_publish_date.html @@ -0,0 +1,7 @@ + + + + + + + diff --git a/tests/data/extractors/publishdate/test_publish_date.json b/tests/data/extractors/publishdate/test_publish_date.json new file mode 100644 index 00000000..a37e1173 --- /dev/null +++ b/tests/data/extractors/publishdate/test_publish_date.json @@ -0,0 +1,6 @@ +{ + "url": "http://example.com/example", + "expected": { + "publish_date": "2014-06-30T16:54:02+00:00" + } +} diff --git a/tests/data/extractors/publishdate/test_publish_date_article.html b/tests/data/extractors/publishdate/test_publish_date_article.html new file mode 100644 index 00000000..3d03667e --- /dev/null +++ b/tests/data/extractors/publishdate/test_publish_date_article.html @@ -0,0 +1,7 @@ + + + + + + + diff --git a/tests/data/extractors/publishdate/test_publish_date_article.json b/tests/data/extractors/publishdate/test_publish_date_article.json new file mode 100644 index 00000000..06f14aa6 --- /dev/null +++ b/tests/data/extractors/publishdate/test_publish_date_article.json @@ -0,0 +1,6 @@ +{ + "url": "http://example.com/example", + "expected": { + "publish_date": "2012-01-11T15:55:01+00:00" + } +} diff --git a/tests/data/extractors/publishdate/test_publish_date_rnews.html b/tests/data/extractors/publishdate/test_publish_date_rnews.html new file mode 100644 index 00000000..ca71f718 --- /dev/null +++ b/tests/data/extractors/publishdate/test_publish_date_rnews.html @@ -0,0 +1,7 @@ + + + + + + + diff --git a/tests/data/extractors/publishdate/test_publish_date_rnews.json b/tests/data/extractors/publishdate/test_publish_date_rnews.json new file mode 100644 index 00000000..623b13bb --- /dev/null +++ b/tests/data/extractors/publishdate/test_publish_date_rnews.json @@ -0,0 +1,6 @@ +{ + "url": "http://example.com/example", + "expected": { + "publish_date": "2010-02-22T11:53:04+00:00" + } +} diff --git a/tests/data/extractors/publishdate/test_publish_date_schema.html b/tests/data/extractors/publishdate/test_publish_date_schema.html new file mode 100644 index 00000000..8a666dfa --- /dev/null +++ b/tests/data/extractors/publishdate/test_publish_date_schema.html @@ -0,0 +1,15 @@ + + + + test video + + + +
    + +

+ TextNode 1 - Scala support in the IDE is one of the few pain points for developers who want to start using Scala in their Java project. On an existing long-term project developed by a team, it's hard to step in and introduce a new language that is not supported by the existing IDE. One way to go about it is to hide the fact that you use Scala from the Java world by using one-way dependency injection. Still, if you wish to truly absorb Scala into your existing Java environment, then you'll soon introduce cross-language dependencies. +

    +
    + + diff --git a/tests/data/extractors/publishdate/test_publish_date_schema.json b/tests/data/extractors/publishdate/test_publish_date_schema.json new file mode 100644 index 00000000..8e150921 --- /dev/null +++ b/tests/data/extractors/publishdate/test_publish_date_schema.json @@ -0,0 +1,6 @@ +{ + "url": "http://example.com/example", + "expected": { + "publish_date": "2014-10-09T12:06:16" + } +} diff --git a/tests/data/extractors/test_tags_abcau.html b/tests/data/extractors/tags/test_tags_abcau.html similarity index 100% rename from tests/data/extractors/test_tags_abcau.html rename to tests/data/extractors/tags/test_tags_abcau.html diff --git a/tests/data/extractors/test_tags_abcau.json b/tests/data/extractors/tags/test_tags_abcau.json similarity index 100% rename from tests/data/extractors/test_tags_abcau.json rename to tests/data/extractors/tags/test_tags_abcau.json diff --git a/tests/data/extractors/test_tags_cnet.html b/tests/data/extractors/tags/test_tags_cnet.html similarity index 100% rename from tests/data/extractors/test_tags_cnet.html rename to tests/data/extractors/tags/test_tags_cnet.html diff --git a/tests/data/extractors/test_tags_cnet.json b/tests/data/extractors/tags/test_tags_cnet.json similarity index 100% rename from tests/data/extractors/test_tags_cnet.json rename to tests/data/extractors/tags/test_tags_cnet.json diff --git a/tests/data/extractors/test_tags_deadline.html b/tests/data/extractors/tags/test_tags_deadline.html similarity index 100% rename from tests/data/extractors/test_tags_deadline.html rename to tests/data/extractors/tags/test_tags_deadline.html diff --git a/tests/data/extractors/test_tags_deadline.json b/tests/data/extractors/tags/test_tags_deadline.json similarity index 100% rename from tests/data/extractors/test_tags_deadline.json rename to tests/data/extractors/tags/test_tags_deadline.json diff --git a/tests/data/extractors/test_tags_kexp.html b/tests/data/extractors/tags/test_tags_kexp.html similarity index 100% rename from tests/data/extractors/test_tags_kexp.html rename to tests/data/extractors/tags/test_tags_kexp.html diff --git a/tests/data/extractors/test_tags_kexp.json b/tests/data/extractors/tags/test_tags_kexp.json similarity index 100% rename from tests/data/extractors/test_tags_kexp.json rename to tests/data/extractors/tags/test_tags_kexp.json diff --git a/tests/data/extractors/test_tags_wnyc.html b/tests/data/extractors/tags/test_tags_wnyc.html similarity index 100% rename from tests/data/extractors/test_tags_wnyc.html rename to tests/data/extractors/tags/test_tags_wnyc.html diff --git a/tests/data/extractors/test_tags_wnyc.json b/tests/data/extractors/tags/test_tags_wnyc.json similarity index 100% rename from tests/data/extractors/test_tags_wnyc.json rename to tests/data/extractors/tags/test_tags_wnyc.json diff --git a/tests/data/extractors/test_businessinsider1.html b/tests/data/extractors/test_businessinsider1.html deleted file mode 100644 index 18603a35..00000000 --- a/tests/data/extractors/test_businessinsider1.html +++ /dev/null @@ -1,2211 +0,0 @@ - - - - - MEANWHILE: Developments In Greece... - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-MEANWHILE: Developments In Greece...
-
-As everyone in the world was transfixed on the Fed, Greece continues to do what it takes to get its next bailout tranche and stay in the Eurozone.
-
-The Greek cabinet agreed on yet another round of austerity measures today. The details come to us from Reuters:
-
-    • Pensions of more than $1,642.913 a month will be cut by 20%
-    • Payments to state workers who retired before age 55 will be reduced
-    • 30,000 civil servants will go into "labor reserve" -- that means their pay will be reduced to 60% of their salaries while they have 12 months to find a new job in the state sector or be laid off.
-
-The government also promised to extend a new real estate tax, intended to end next year, until 2014.
-
-An official sign-off from the Troika has still not been given and, according to journalist Matina Stevis, there still needs to be a parliament vote.
-
-Please follow Money Game on Twitter and Facebook.
-Follow Linette Lopez on Twitter.
-[... share widgets, author bio, reader comments, newsletter signup, stock tickers, ads and site footer omitted ...]
diff --git a/tests/data/extractors/test_businessinsider1.json b/tests/data/extractors/test_businessinsider1.json
deleted file mode 100644
index a12c5838..00000000
--- a/tests/data/extractors/test_businessinsider1.json
+++ /dev/null
@@ -1,12 +0,0 @@
-{
-    "url": "http://articles.businessinsider.com/2011-09-21/markets/30183619_1_parliament-vote-greece-civil-servants",
-    "expected": {
-        "meta_description": "More moves to survive.",
-        "domain": "articles.businessinsider.com",
-        "final_url": "http://articles.businessinsider.com/2011-09-21/markets/30183619_1_parliament-vote-greece-civil-servants",
-        "meta_keywords": "Economy, Greece, Austerity, Economic Crisis, Eurozone, Euro, Europe, Linette Lopez",
-        "cleaned_text": "As everyone in the world was transfixed on the Fed",
-        "meta_favicon": "http://static7.businessinsider.com/assets/images/faviconBI.ico",
-        "meta_lang": "en"
-    }
-}
\ No newline at end of file
diff --git a/tests/data/extractors/test_businessinsider2.html b/tests/data/extractors/test_businessinsider2.html
deleted file mode 100644
index 56573300..00000000
--- a/tests/data/extractors/test_businessinsider2.html
+++ /dev/null
@@ -1,2278 +0,0 @@
-[... deleted fixture: page head, scripts, navigation and masthead omitted; the article body text follows ...]
-GOLDMAN: 4 Key Points On The FOMC Announcement
-
-twist
-Image: YouTube
-
-From Goldman on the FOMC operation twist announcement:
-
--------------
-
-1. As we had expected, the Federal Open Market Committee decided to "do the twist" and increase the duration of its securities holdings by selling shorter-maturity securities ($400bn of Treasuries with maturity of 3 years or less) and buying longer-maturity securities ($400bn of Treasuries with maturity 6-30 years).
-
-2. The Fed chose to maintain the interest rate on excess reserves (IOER) at 25bp, contrary to our expectations of a small cut, but overall the details of today's action were more aggressive than expected in two respects: First, a relatively large portion of the purchases will occur at the long end (29% in the 20-30 year maturity bucket), implying a total impact of more than $400bn in 10-year equivalents, versus market expectations of perhaps $300-350bn. Second, the Fed will reinvest maturing and prepaid agency MBS and agency debt in agency MBS, rather than Treasuries, suggesting a bit more support for the housing sector. The statement retained an easing bias, noting again that the FOMC "is prepared to employ its tools" to "promote a stronger economic recovery in a context of price stability".
-
-3. Consistent with the more aggressive policy easing, the statement emphasizes the weak state of the economy, suggesting "continuing weakness in overall labor market conditions" and "only a modest pace" of growth in consumer spending. The FOMC notes the moderation in (headline) inflation in recent months and, as before, expects it to "settle...at levels at or below those consistent with the Committee's dual mandate". While the FOMC still forecasts some improvement in the pace of growth, "there are significant downside risks to the economic outlook, including strains in global financial markets".
-
-4. Once again, three FOMC members--Dallas Fed President Fisher, Minneapolis Fed President Kocherlakota, and Philadelphia Fed President Plosser--dissented, with the statement noting only that they "did not support additional policy accommodation at this time".
-
-Please follow Money Game on Twitter and Facebook.
-Follow Joe Weisenthal on Twitter.
-[... share widgets, author contact details, reader comments, newsletter signup, stock tickers, ads and site footer omitted ...]
\ No newline at end of file
diff --git a/tests/data/extractors/test_businessinsider2.json b/tests/data/extractors/test_businessinsider2.json
deleted file mode 100644
index 0329e87e..00000000
--- a/tests/data/extractors/test_businessinsider2.json
+++ /dev/null
@@ -1,12 +0,0 @@
-{
-    "url": "http://www.businessinsider.com/goldman-on-the-fed-announcement-2011-9",
-    "expected": {
-        "meta_description": "Here it is.",
-        "domain": "www.businessinsider.com",
-        "final_url": "http://www.businessinsider.com/goldman-on-the-fed-announcement-2011-9",
-        "meta_keywords": "Federal Reserve, Joe Weisenthal",
-        "cleaned_text": "From Goldman on the FOMC operation twist announcement",
-        "meta_favicon": "http://static7.businessinsider.com/assets/images/faviconBI.ico",
-        "meta_lang": "en"
-    }
-}
\ No newline at end of file
diff --git a/tests/data/extractors/title/test_title_empty.html b/tests/data/extractors/title/test_title_empty.html
new file mode 100644
index 00000000..63a8cab9
--- /dev/null
+++ b/tests/data/extractors/title/test_title_empty.html
@@ -0,0 +1,12 @@
+
+
+
+
+
    +

    + TextNode 1 - The Scala-supported IDE is one of the few pain points for developers who want to start using Scala in their Java project. On an existing long-term project developed by a team, it's hard to step in and introduce a new language that is not supported by the existing IDE. One way to go about it is to hide the fact that you use Scala from the Java world by using one-way dependency injection. Still, if you wish to truly absorb Scala into your existing Java environment, then you'll soon introduce cross-language dependencies. +

    +
    +
    +
    +
diff --git a/tests/data/extractors/title/test_title_empty.json b/tests/data/extractors/title/test_title_empty.json
new file mode 100644
index 00000000..c31bab9f
--- /dev/null
+++ b/tests/data/extractors/title/test_title_empty.json
@@ -0,0 +1,6 @@
+{
+    "url": "http://exemple.com/test_title_empty.html",
+    "expected": {
+        "title": ""
+    }
+}
diff --git a/tests/data/extractors/title/test_title_opengraph.html b/tests/data/extractors/title/test_title_opengraph.html
new file mode 100644
index 00000000..6e6c0c64
--- /dev/null
+++ b/tests/data/extractors/title/test_title_opengraph.html
@@ -0,0 +1,14 @@
+
+
+
+
+ Wrong article title - website
+
+
    +

    + TextNode 1 - The Scala-supported IDE is one of the few pain points for developers who want to start using Scala in their Java project. On an existing long-term project developed by a team, it's hard to step in and introduce a new language that is not supported by the existing IDE. One way to go about it is to hide the fact that you use Scala from the Java world by using one-way dependency injection. Still, if you wish to truly absorb Scala into your existing Java environment, then you'll soon introduce cross-language dependencies. +

    +
    +
    +
diff --git a/tests/data/extractors/title/test_title_opengraph.json b/tests/data/extractors/title/test_title_opengraph.json
new file mode 100644
index 00000000..b4b6cdea
--- /dev/null
+++ b/tests/data/extractors/title/test_title_opengraph.json
@@ -0,0 +1,6 @@
+{
+    "url": "http://exemple.com/test_opengraphcontent",
+    "expected": {
+        "title": "Good article title"
+    }
+}
diff --git a/tests/data/extractors/tweets/test_tweet.html b/tests/data/extractors/tweets/test_tweet.html
new file mode 100644
index 00000000..0a390dd8
--- /dev/null
+++ b/tests/data/extractors/tweets/test_tweet.html
@@ -0,0 +1,21 @@
+
+
    +

    + TextNode 1 - The Scala-supported IDE is one of the few pain points for developers who want to start using Scala in their Java project. On an existing long-term project developed by a team, it's hard to step in and introduce a new language that is not supported by the existing IDE. One way to go about it is to hide the fact that you use Scala from the Java world by using one-way dependency injection. +

    + + Still, if you wish to truly absorb Scala into your existing Java environment, then you'll soon introduce cross-language dependencies. +

    +
    +
    +

    + TextNode 1 - The Scala-supported IDE is one of the few pain points for developers who want to start using Scala in their Java project. On an existing long-term project developed by a team, it's hard to step in and introduce a new language that is not supported by the existing IDE. One way to go about it is to hide the fact that you use Scala from the Java world by using one-way dependency injection. Still, if you wish to truly absorb Scala into your existing Java environment, then you'll soon introduce cross-language dependencies. +

    + + + +

    +
    +
    +
diff --git a/tests/data/extractors/tweets/test_tweet.json b/tests/data/extractors/tweets/test_tweet.json
new file mode 100644
index 00000000..80986ad6
--- /dev/null
+++ b/tests/data/extractors/tweets/test_tweet.json
@@ -0,0 +1,6 @@
+{
+    "url": "http://exemple.com/tweet/",
+    "expected": {
+        "tweets": 2
+    }
+}
diff --git a/tests/data/videos/test_embed.html b/tests/data/extractors/videos/test_embed.html
similarity index 100%
rename from tests/data/videos/test_embed.html
rename to tests/data/extractors/videos/test_embed.html
diff --git a/tests/data/videos/test_embed.json b/tests/data/extractors/videos/test_embed.json
similarity index 100%
rename from tests/data/videos/test_embed.json
rename to tests/data/extractors/videos/test_embed.json
diff --git a/tests/data/videos/test_iframe.html b/tests/data/extractors/videos/test_iframe.html
similarity index 100%
rename from tests/data/videos/test_iframe.html
rename to tests/data/extractors/videos/test_iframe.html
diff --git a/tests/data/videos/test_iframe.json b/tests/data/extractors/videos/test_iframe.json
similarity index 100%
rename from tests/data/videos/test_iframe.json
rename to tests/data/extractors/videos/test_iframe.json
diff --git a/tests/data/videos/test_object.html b/tests/data/extractors/videos/test_object.html
similarity index 100%
rename from tests/data/videos/test_object.html
rename to tests/data/extractors/videos/test_object.html
diff --git a/tests/data/videos/test_object.json b/tests/data/extractors/videos/test_object.json
similarity index 100%
rename from tests/data/videos/test_object.json
rename to tests/data/extractors/videos/test_object.json
diff --git a/goose/images/__init__.py b/tests/extractors/__init__.py
similarity index 100%
rename from goose/images/__init__.py
rename to tests/extractors/__init__.py
diff --git a/tests/extractors/authors.py b/tests/extractors/authors.py
new file mode 100644
index 00000000..709040c1
--- /dev/null
+++ b/tests/extractors/authors.py
@@ -0,0 +1,32 @@
+# -*- coding: utf-8 -*-
+"""\
+This is a Python port of "Goose" originally licensed to Gravity.com
+under one or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.
+
+Python port was written by Xavier Grangier for Recrutae
+
+Gravity.com licenses this file
+to you under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from base import TestExtractionBase
+
+
+class TestArticleAuthor(TestExtractionBase):
+
+    def test_author_schema(self):
+        article = self.getArticle()
+        fields = ['authors']
+        self.runArticleAssertions(article=article, fields=fields)
diff --git a/tests/extractors/base.py b/tests/extractors/base.py
new file mode 100644
index 00000000..e19d20e0
--- /dev/null
+++ b/tests/extractors/base.py
@@ -0,0 +1,235 @@
+# -*- coding: utf-8 -*-
+"""\
+This is a Python port of "Goose" originally licensed to Gravity.com
+under one or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.
+
+Python port was written by Xavier Grangier for Recrutae
+
+Gravity.com licenses this file
+to you under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import os
+import json
+import urllib2
+import unittest
+import socket
+
+from StringIO import StringIO
+
+from goose import Goose
+from goose.utils import FileHelper
+from goose.configuration import Configuration
+
+
+CURRENT_PATH = os.path.dirname(os.path.abspath(__file__))
+
+
+# Response
+class MockResponse():
+    """\
+    Base mock response class
+    """
+    code = 200
+    msg = "OK"
+
+    def __init__(self, cls):
+        self.cls = cls
+
+    def content(self, req):  # req is unused here but keeps the signature of the subclasses
+        return "response"
+
+    def response(self, req):
+        data = self.content(req)
+        url = req.get_full_url()
+        resp = urllib2.addinfourl(StringIO(data), data, url)
+        resp.code = self.code
+        resp.msg = self.msg
+        return resp
+
+
+class MockHTTPHandler(urllib2.HTTPHandler, urllib2.HTTPSHandler):
+    """\
+    Mocked HTTPHandler in order to query APIs locally
+    """
+    cls = None
+
+    def https_open(self, req):
+        return self.http_open(req)
+
+    def http_open(self, req):
+        r = self.cls.callback(self.cls)
+        return r.response(req)
+
+    @staticmethod
+    def patch(cls):
+        opener = urllib2.build_opener(MockHTTPHandler)
+        urllib2.install_opener(opener)
+        # dirty! reach into the installed opener to attach the test class
+        for h in opener.handlers:
+            if isinstance(h, MockHTTPHandler):
+                h.cls = cls
+        return [h for h in opener.handlers if isinstance(h, MockHTTPHandler)][0]
+
+    @staticmethod
+    def unpatch():
+        # urllib2
+        urllib2._opener = None
+
+
+class BaseMockTests(unittest.TestCase):
+    """\
+    Base Mock test case
+    """
+    callback = MockResponse
+
+    def setUp(self):
+        # patch DNS
+        self.original_getaddrinfo = socket.getaddrinfo
+        socket.getaddrinfo = self.new_getaddrinfo
+        MockHTTPHandler.patch(self)
+
+    def tearDown(self):
+        MockHTTPHandler.unpatch()
+        # DNS
+        socket.getaddrinfo = self.original_getaddrinfo
+
+    def new_getaddrinfo(self, *args):
+        return [(2, 1, 6, '', ('127.0.0.1', 0))]
+
+    def _get_current_testname(self):
+        return self.id().split('.')[-1:][0]
+
+
+class MockResponseExtractors(MockResponse):
+    def content(self, req):
+        test, suite, module, cls, func = self.cls.id().split('.')
+        path = os.path.join(
+            os.path.dirname(CURRENT_PATH),
+            "data",
+            suite,
+            module,
+            "%s.html" % func)
+        path = os.path.abspath(path)
+        content = FileHelper.loadResourceFile(path)
+        return content
+
+
+class TestExtractionBase(BaseMockTests):
+    """\
+    Extraction test case
+    """
+    callback = MockResponseExtractors
+
+    def getRawHtml(self):
+        test, suite, module, cls, func = self.id().split('.')
+        path = os.path.join(
+            os.path.dirname(CURRENT_PATH),
+            "data",
+            suite,
+            module,
+            "%s.html" % func)
+        path = os.path.abspath(path)
+        content = FileHelper.loadResourceFile(path)
+        return content
+
+    def loadData(self):
+        """\
+        Load the JSON fixture holding the expected values for the current test.
+        """
+        test, suite, module, cls, func = self.id().split('.')
+        path = os.path.join(
+            os.path.dirname(CURRENT_PATH),
+            "data",
+            suite,
+            module,
+            "%s.json" % func)
+        path = os.path.abspath(path)
+        content = FileHelper.loadResourceFile(path)
+        self.data = json.loads(content)
+
+    def assert_cleaned_text(self, field, expected_value, result_value):
+        """\
+        Custom assertion for the cleaned_text field.
+        """
+        # # TODO : handle verbose level in tests
+        # print "\n=======================::. ARTICLE REPORT %s .::======================\n" % self.id()
+        # print 'expected_value (%s) \n' % len(expected_value)
+        # print expected_value
+        # print "-------"
+        # print 'result_value (%s) \n' % len(result_value)
+        # print result_value
+
+        # cleaned_text is Null
+        msg = u"Resulting article text was NULL!"
+        self.assertNotEqual(result_value, None, msg=msg)
+
+        # cleaned_text length
+        msg = u"Article text was not as long as expected beginning!"
+        self.assertTrue(len(expected_value) <= len(result_value), msg=msg)
+
+        # clean_text value
+        result_value = result_value[0:len(expected_value)]
+        msg = u"The beginning of the article text was not as expected!"
+        self.assertEqual(expected_value, result_value, msg=msg)
+
+    def runArticleAssertions(self, article, fields):
+        """\
+        Check every expected field, dispatching to assert_<field> when defined.
+        """
+        for field in fields:
+            expected_value = self.data['expected'][field]
+            result_value = getattr(article, field, None)
+
+            # custom assertion for a given field
+            assertion = 'assert_%s' % field
+            if hasattr(self, assertion):
+                getattr(self, assertion)(field, expected_value, result_value)
+                continue
+
+            # default assertion
+            msg = u"Error %s \nexpected: %s\nresult: %s" % (field, expected_value, result_value)
+            self.assertEqual(expected_value, result_value, msg=msg)
+
+    def extract(self, instance):
+        article = instance.extract(url=self.data['url'])
+        return article
+
+    def getConfig(self):
+        config = Configuration()
+        config.enable_image_fetching = False
+        return config
+
+    def getArticle(self):
+        """\
+        Load the fixture data, configure Goose and run the extraction.
+        """
+        # load test case data
+        self.loadData()
+
+        # basic configuration
+        # no image fetching
+        config = self.getConfig()
+        self.parser = config.get_parser()
+
+        # target language
+        # needed for non-English languages most of the time
+        target_language = self.data.get('target_language')
+        if target_language:
+            config.target_language = target_language
+            config.use_meta_language = False
+
+        # run goose
+        g = Goose(config=config)
+        return self.extract(g)
diff --git a/tests/extractors.py b/tests/extractors/content.py
similarity index 65%
rename from tests/extractors.py
rename to tests/extractors/content.py
index 84ba0502..30dc2754 100644
--- a/tests/extractors.py
+++ b/tests/extractors/content.py
@@ -20,147 +20,13 @@
 See the License for the specific language governing permissions and
 limitations under the License.
""" -import os -import json +from base import TestExtractionBase -from base import BaseMockTests, MockResponse - -from goose import Goose -from goose.utils import FileHelper -from goose.configuration import Configuration from goose.text import StopWordsChinese from goose.text import StopWordsArabic from goose.text import StopWordsKorean -CURRENT_PATH = os.path.dirname(os.path.abspath(__file__)) - - -class MockResponseExtractors(MockResponse): - def content(self, req): - current_test = self.cls._get_current_testname() - path = os.path.join(CURRENT_PATH, "data", "extractors", "%s.html" % current_test) - path = os.path.abspath(path) - content = FileHelper.loadResourceFile(path) - return content - - -class TestExtractionBase(BaseMockTests): - """\ - Extraction test case - """ - callback = MockResponseExtractors - - def getRawHtml(self): - suite, module, cls, func = self.id().split('.') - path = os.path.join(CURRENT_PATH, "data", module, "%s.html" % func) - path = os.path.abspath(path) - content = FileHelper.loadResourceFile(path) - return content - - def loadData(self): - """\ - - """ - suite, module, cls, func = self.id().split('.') - path = os.path.join(CURRENT_PATH, "data", module, "%s.json" % func) - path = os.path.abspath(path) - content = FileHelper.loadResourceFile(path) - self.data = json.loads(content) - - def assert_cleaned_text(self, field, expected_value, result_value): - """\ - - """ - # # TODO : handle verbose level in tests - # print "\n=======================::. ARTICLE REPORT %s .::======================\n" % self.id() - # print 'expected_value (%s) \n' % len(expected_value) - # print expected_value - # print "-------" - # print 'result_value (%s) \n' % len(result_value) - # print result_value - - # cleaned_text is Null - msg = u"Resulting article text was NULL!" - self.assertNotEqual(result_value, None, msg=msg) - - # cleaned_text length - msg = u"Article text was not as long as expected beginning!" - self.assertTrue(len(expected_value) <= len(result_value), msg=msg) - - # clean_text value - result_value = result_value[0:len(expected_value)] - msg = u"The beginning of the article text was not as expected!" 
-        self.assertEqual(expected_value, result_value, msg=msg)
-
-    def assert_tags(self, field, expected_value, result_value):
-        """\
-
-        """
-        # as we have a set in expected_value and a list in result_value
-        # make result_value a set
-        expected_value = set(expected_value)
-
-        # check if both have the same number of items
-        msg = (u"expected tags set and result tags set"
-               u"don't have the same number of items")
-        self.assertEqual(len(result_value), len(expected_value), msg=msg)
-
-        # check if each tag in result_value is in expected_value
-        for tag in result_value:
-            self.assertTrue(tag in expected_value)
-
-    def runArticleAssertions(self, article, fields):
-        """\
-
-        """
-        for field in fields:
-            expected_value = self.data['expected'][field]
-            result_value = getattr(article, field, None)
-
-            # custom assertion for a given field
-            assertion = 'assert_%s' % field
-            if hasattr(self, assertion):
-                getattr(self, assertion)(field, expected_value, result_value)
-                continue
-
-            # default assertion
-            msg = u"Error %s" % field
-            self.assertEqual(expected_value, result_value, msg=msg)
-
-    def extract(self, instance):
-        article = instance.extract(url=self.data['url'])
-        return article
-
-    def getConfig(self):
-        config = Configuration()
-        config.enable_image_fetching = False
-        return config
-
-    def getArticle(self):
-        """\
-
-        """
-        # load test case data
-        self.loadData()
-
-        # basic configuration
-        # no image fetching
-        config = self.getConfig()
-        self.parser = config.get_parser()
-
-        # target language
-        # needed for non english language most of the time
-        target_language = self.data.get('target_language')
-        if target_language:
-            config.target_language = target_language
-            config.use_meta_language = False
-
-        # run goose
-        g = Goose(config=config)
-        return self.extract(g)
-
-
 class TestExtractions(TestExtractionBase):
 
@@ -285,16 +151,6 @@ def test_politico(self):
         fields = ['cleaned_text']
         self.runArticleAssertions(article=article, fields=fields)
 
-    def test_businessinsider1(self):
-        article = self.getArticle()
-        fields = ['cleaned_text']
-        self.runArticleAssertions(article=article, fields=fields)
-
-    def test_businessinsider2(self):
-        article = self.getArticle()
-        fields = ['cleaned_text']
-        self.runArticleAssertions(article=article, fields=fields)
-
     def test_businessinsider3(self):
         article = self.getArticle()
         fields = ['cleaned_text']
@@ -355,6 +211,35 @@ def test_okaymarketing(self):
         fields = ['cleaned_text']
         self.runArticleAssertions(article=article, fields=fields)
 
+    def test_issue129(self):
+        article = self.getArticle()
+        fields = ['cleaned_text']
+        self.runArticleAssertions(article=article, fields=fields)
+
+    def test_issue115(self):
+        # https://github.com/grangier/python-goose/issues/115
+        article = self.getArticle()
+        fields = ['cleaned_text']
+        self.runArticleAssertions(article=article, fields=fields)
+
+
+class TestArticleTopNode(TestExtractionBase):
+
+    def test_articlebody_itemprop(self):
+        article = self.getArticle()
+        fields = ['cleaned_text']
+        self.runArticleAssertions(article=article, fields=fields)
+
+    def test_articlebody_attribute(self):
+        article = self.getArticle()
+        fields = ['cleaned_text']
+        self.runArticleAssertions(article=article, fields=fields)
+
+    def test_articlebody_tag(self):
+        article = self.getArticle()
+        fields = ['cleaned_text']
+        self.runArticleAssertions(article=article, fields=fields)
+
 
 class TestExtractWithUrl(TestExtractionBase):
 
@@ -408,34 +293,3 @@ class TestExtractionsRaw(TestExtractions):
 
     def extract(self, instance):
         article = instance.extract(raw_html=self.getRawHtml())
         return article
-
-
-class TestArticleTags(TestExtractionBase):
-
-    def test_tags_kexp(self):
-        article = self.getArticle()
-        fields = ['tags']
-        self.runArticleAssertions(article=article, fields=fields)
-
-    def test_tags_deadline(self):
-        article = self.getArticle()
-        fields = ['tags']
-        self.runArticleAssertions(article=article, fields=fields)
-
-    def test_tags_wnyc(self):
-        article = self.getArticle()
-        fields = ['tags']
-        self.runArticleAssertions(article=article, fields=fields)
-
-    def test_tags_cnet(self):
-        article = self.getArticle()
-        fields = ['tags']
-        self.runArticleAssertions(article=article, fields=fields)
-
-    def test_tags_abcau(self):
-        """
-        Test ABC Australia page with "topics" tags
-        """
-        article = self.getArticle()
-        fields = ['tags']
-        self.runArticleAssertions(article=article, fields=fields)
diff --git a/tests/images.py b/tests/extractors/images.py
similarity index 87%
rename from tests/images.py
rename to tests/extractors/images.py
index e0fc2d08..e47a1dde 100644
--- a/tests/images.py
+++ b/tests/extractors/images.py
@@ -26,13 +26,14 @@
 import unittest
 
 from base import MockResponse
-from extractors import TestExtractionBase
+from base import TestExtractionBase
 
 from goose.configuration import Configuration
-from goose.images.image import Image
-from goose.images.image import ImageDetails
-from goose.images.utils import ImageUtils
+from goose.image import Image
+from goose.image import ImageDetails
 from goose.utils import FileHelper
+from goose.utils.images import ImageUtils
+
 
 CURRENT_PATH = os.path.dirname(os.path.abspath(__file__))
 
@@ -42,7 +43,13 @@ class MockResponseImage(MockResponse):
     def image_content(self, req):
         md5_hash = hashlib.md5(req.get_full_url()).hexdigest()
         current_test = self.cls._get_current_testname()
-        path = os.path.join(CURRENT_PATH, "data", "images", current_test, md5_hash)
+        path = os.path.join(
+            os.path.dirname(CURRENT_PATH),
+            "data",
+            "extractors",
+            "images",
+            current_test,
+            md5_hash)
         path = os.path.abspath(path)
         f = open(path, 'rb')
         content = f.read()
@@ -51,7 +58,13 @@ def image_content(self, req):
 
     def html_content(self, req):
         current_test = self.cls._get_current_testname()
-        path = os.path.join(CURRENT_PATH, "data", "images", current_test, "%s.html" % current_test)
+        path = os.path.join(
+            os.path.dirname(CURRENT_PATH),
+            "data",
+            "extractors",
+            "images",
+            current_test,
+            "%s.html" % current_test)
         path = os.path.abspath(path)
         return FileHelper.loadResourceFile(path)
 
@@ -71,8 +84,15 @@ def loadData(self):
         """\
 
         """
-        suite, module, cls, func = self.id().split('.')
-        path = os.path.join(CURRENT_PATH, "data", module, func, "%s.json" % func)
+        test, suite, module, cls, func = self.id().split('.')
+        path = os.path.join(
+            os.path.dirname(CURRENT_PATH),
+            "data",
+            suite,
+            module,
+            func,
+            "%s.json" % func)
+        path = os.path.abspath(path)
         content = FileHelper.loadResourceFile(path)
         self.data = json.loads(content)
 
@@ -157,7 +177,7 @@ def test_opengraph_tag(self):
 class ImageUtilsTests(unittest.TestCase):
 
     def setUp(self):
-        self.path = 'tests/data/images/test_basic_image/50850547cc7310bc53e30e802c6318f1'
+        self.path = 'tests/data/extractors/images/test_basic_image/50850547cc7310bc53e30e802c6318f1'
         self.expected_results = {
             'width': 476,
             'height': 317,
diff --git a/tests/extractors/links.py b/tests/extractors/links.py
new file mode 100644
index 00000000..8539465e
--- /dev/null
+++ b/tests/extractors/links.py
@@ -0,0 +1,33 @@
+# -*- coding: utf-8 -*-
+"""\
+This is a Python port of "Goose" originally licensed to Gravity.com
+under one or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.
+
+Python port was written by Xavier Grangier for Recrutae
+
+Gravity.com licenses this file
+to you under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from base import TestExtractionBase
+
+
+class TestArticleLinks(TestExtractionBase):
+
+    def test_links(self):
+        article = self.getArticle()
+        number_links = len(article.links)
+        expected_number_links = self.data['expected']['links']
+        self.assertEqual(number_links, expected_number_links)
diff --git a/tests/extractors/metas.py b/tests/extractors/metas.py
new file mode 100644
index 00000000..fd45915a
--- /dev/null
+++ b/tests/extractors/metas.py
@@ -0,0 +1,29 @@
+# -*- coding: utf-8 -*-
+"""\
+This is a Python port of "Goose" originally licensed to Gravity.com
+under one or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.
+
+Python port was written by Xavier Grangier for Recrutae
+
+Gravity.com licenses this file
+to you under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from base import TestExtractionBase
+
+
+class TestMetas(TestExtractionBase):
+
+    pass
diff --git a/tests/extractors/opengraph.py b/tests/extractors/opengraph.py
new file mode 100644
index 00000000..415a784c
--- /dev/null
+++ b/tests/extractors/opengraph.py
@@ -0,0 +1,32 @@
+# -*- coding: utf-8 -*-
+"""\
+This is a Python port of "Goose" originally licensed to Gravity.com
+under one or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.
+
+Python port was written by Xavier Grangier for Recrutae
+
+Gravity.com licenses this file
+to you under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+""" + +from base import TestExtractionBase + + +class TestOpenGraph(TestExtractionBase): + + def test_opengraph(self): + article = self.getArticle() + fields = ['opengraph'] + self.runArticleAssertions(article=article, fields=fields) diff --git a/tests/extractors/publishdate.py b/tests/extractors/publishdate.py new file mode 100644 index 00000000..8d2a13b9 --- /dev/null +++ b/tests/extractors/publishdate.py @@ -0,0 +1,43 @@ +# -*- coding: utf-8 -*- +"""\ +This is a python port of "Goose" orignialy licensed to Gravity.com +under one or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. + +Python port was written by Xavier Grangier for Recrutae + +Gravity.com licenses this file +to you under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance +with the License. You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from base import TestExtractionBase + + +class TestPublishDate(TestExtractionBase): + + def test_publish_date(self): + article = self.getArticle() + self.runArticleAssertions(article=article, fields=['publish_date']) + + def test_publish_date_rnews(self): + article = self.getArticle() + self.runArticleAssertions(article=article, fields=['publish_date']) + + def test_publish_date_article(self): + article = self.getArticle() + self.runArticleAssertions(article=article, fields=['publish_date']) + + def test_publish_date_schema(self): + article = self.getArticle() + self.runArticleAssertions(article=article, fields=['publish_date']) diff --git a/tests/extractors/tags.py b/tests/extractors/tags.py new file mode 100644 index 00000000..22b17129 --- /dev/null +++ b/tests/extractors/tags.py @@ -0,0 +1,72 @@ +# -*- coding: utf-8 -*- +"""\ +This is a python port of "Goose" orignialy licensed to Gravity.com +under one or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. + +Python port was written by Xavier Grangier for Recrutae + +Gravity.com licenses this file +to you under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance +with the License. You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +from base import TestExtractionBase + + +class TestArticleTags(TestExtractionBase): + + def assert_tags(self, field, expected_value, result_value): + """\ + + """ + # as we have a set in expected_value and a list in result_value + # make result_value a set + expected_value = set(expected_value) + + # check if both have the same number of items + msg = (u"expected tags set and result tags set" + u"don't have the same number of items") + self.assertEqual(len(result_value), len(expected_value), msg=msg) + + # check if each tag in result_value is in expected_value + for tag in result_value: + self.assertTrue(tag in expected_value) + + def test_tags_kexp(self): + article = self.getArticle() + fields = ['tags'] + self.runArticleAssertions(article=article, fields=fields) + + def test_tags_deadline(self): + article = self.getArticle() + fields = ['tags'] + self.runArticleAssertions(article=article, fields=fields) + + def test_tags_wnyc(self): + article = self.getArticle() + fields = ['tags'] + self.runArticleAssertions(article=article, fields=fields) + + def test_tags_cnet(self): + article = self.getArticle() + fields = ['tags'] + self.runArticleAssertions(article=article, fields=fields) + + def test_tags_abcau(self): + """ + Test ABC Australia page with "topics" tags + """ + article = self.getArticle() + fields = ['tags'] + self.runArticleAssertions(article=article, fields=fields) diff --git a/tests/extractors/title.py b/tests/extractors/title.py new file mode 100644 index 00000000..09170205 --- /dev/null +++ b/tests/extractors/title.py @@ -0,0 +1,37 @@ +# -*- coding: utf-8 -*- +"""\ +This is a python port of "Goose" orignialy licensed to Gravity.com +under one or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. + +Python port was written by Xavier Grangier for Recrutae + +Gravity.com licenses this file +to you under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance +with the License. You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from base import TestExtractionBase + + +class TestTitle(TestExtractionBase): + + def test_title_opengraph(self): + article = self.getArticle() + fields = ['title'] + self.runArticleAssertions(article=article, fields=fields) + + def test_title_empty(self): + article = self.getArticle() + fields = ['title'] + self.runArticleAssertions(article=article, fields=fields) diff --git a/tests/extractors/tweets.py b/tests/extractors/tweets.py new file mode 100644 index 00000000..50300f43 --- /dev/null +++ b/tests/extractors/tweets.py @@ -0,0 +1,33 @@ +# -*- coding: utf-8 -*- +"""\ +This is a python port of "Goose" orignialy licensed to Gravity.com +under one or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. + +Python port was written by Xavier Grangier for Recrutae + +Gravity.com licenses this file +to you under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance +with the License. 
You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from base import TestExtractionBase + + +class TestArticleTweet(TestExtractionBase): + + def test_tweet(self): + article = self.getArticle() + number_tweets = len(article.tweets) + expected_number_tweets = self.data['expected']['tweets'] + self.assertEqual(number_tweets, expected_number_tweets) diff --git a/tests/videos.py b/tests/extractors/videos.py similarity index 69% rename from tests/videos.py rename to tests/extractors/videos.py index 4f18d0f1..10be15ff 100644 --- a/tests/videos.py +++ b/tests/extractors/videos.py @@ -20,32 +20,13 @@ See the License for the specific language governing permissions and limitations under the License. """ -import os -import json - -from .base import MockResponse -from .extractors import TestExtractionBase - -from goose.utils import FileHelper - -CURRENT_PATH = os.path.dirname(os.path.abspath(__file__)) - - -class MockResponseVideos(MockResponse): - def content(self, req): - current_test = self.cls._get_current_testname() - path = os.path.join(CURRENT_PATH, "data", "videos", "%s.html" % current_test) - path = os.path.abspath(path) - content = FileHelper.loadResourceFile(path) - return content +from base import TestExtractionBase class ImageExtractionTests(TestExtractionBase): """\ Base Mock test case """ - callback = MockResponseVideos - def assert_movies(self, field, expected_value, result_value): # check if result_value is a list self.assertTrue(isinstance(result_value, list)) @@ -59,16 +40,6 @@ def assert_movies(self, field, expected_value, result_value): r = getattr(video, k) self.assertEqual(r, v) - def loadData(self): - """\ - - """ - suite, module, cls, func = self.id().split('.') - path = os.path.join(CURRENT_PATH, "data", module, "%s.json" % func) - path = os.path.abspath(path) - content = FileHelper.loadResourceFile(path) - self.data = json.loads(content) - def test_embed(self): article = self.getArticle() fields = ['movies']
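
With this layout, every extractor test resolves its fixtures from the test id: tests/data/<suite>/<module>/<test_name>.html is served as the mocked HTTP response and <test_name>.json holds the expected values, so a new test module only needs a class deriving from TestExtractionBase plus the two fixture files. A minimal sketch of such a module (the module name and fixture paths below are hypothetical, not part of this patch):

::

    # tests/extractors/descriptions.py -- hypothetical example module
    from base import TestExtractionBase


    class TestMetaDescription(TestExtractionBase):

        def test_description_simple(self):
            # served from tests/data/extractors/descriptions/test_description_simple.html
            # checked against tests/data/extractors/descriptions/test_description_simple.json
            article = self.getArticle()
            self.runArticleAssertions(article=article, fields=['meta_description'])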