diff --git a/.gitignore b/.gitignore index 0175dbaad..3a839a5f3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,9 +1,77 @@ -target -*.iml -out/ -.idea -.classpath +target/ +pom.xml.tag +pom.xml.releaseBackup +pom.xml.versionsBackup +pom.xml.next +release.properties +dependency-reduced-pom.xml +buildNumber.properties +.mvn/timing.properties +# https://github.com/takari/maven-wrapper#usage-without-binary-jar +.mvn/wrapper/maven-wrapper.jar + +# Eclipse m2e generated files +# Eclipse Core .project -.settings/ +# JDT-specific (Eclipse Java Development Tools) +.classpath +.metadata bin/ -.myeclipse +tmp/ +*.tmp +*.bak +*.swp +*~.nib +local.properties +.settings/ +.loadpath +.recommenders + +# External tool builders +.externalToolBuilders/ + +# Locally stored "Eclipse launch configurations" +*.launch + +# PyDev specific (Python IDE for Eclipse) +*.pydevproject + +# CDT-specific (C/C++ Development Tooling) +.cproject + +# CDT- autotools +.autotools + +# Java annotation processor (APT) +.factorypath + +# PDT-specific (PHP Development Tools) +.buildpath + +# sbteclipse plugin +.target + +# Tern plugin +.tern-project + +# TeXlipse plugin +.texlipse + +# STS (Spring Tool Suite) +.springBeans + +# Code Recommenders +.recommenders/ + +# Annotation Processing +.apt_generated/ +.apt_generated_test/ + +# Scala IDE specific (Scala & Java development for Eclipse) +.cache-main +.scala_dependencies +.worksheet + +# Uncomment this line if you wish to ignore the project description file. +# Typically, this file would be tracked if it contains build/dependency configurations: +#.project diff --git a/LICENSE b/LICENSE index 0cecd8527..37d7aa900 100644 --- a/LICENSE +++ b/LICENSE @@ -176,7 +176,7 @@ recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 
- Copyright 2013 code4craft + Copyright 2025 code4craft Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/README-zh.md b/README-zh.md index 65d5d1729..c3c4b72ea 100644 --- a/README-zh.md +++ b/README-zh.md @@ -1,9 +1,10 @@ ![logo](http://webmagic.io/images/logo.jpeg) +[![Maven Central](https://maven-badges.herokuapp.com/maven-central/us.codecraft/webmagic-parent/badge.svg?subject=Maven%20Central)](https://maven-badges.herokuapp.com/maven-central/us.codecraft/webmagic-parent/) +[![License](https://img.shields.io/badge/License-Apache%20License%202.0-blue.svg)](https://www.apache.org/licenses/LICENSE-2.0.html) [![Build Status](https://travis-ci.org/code4craft/webmagic.png?branch=master)](https://travis-ci.org/code4craft/webmagic) - 官方网站[http://webmagic.io/](http://webmagic.io/) >webmagic是一个开源的Java垂直爬虫框架,目标是简化爬虫的开发流程,让开发者专注于逻辑功能的开发。webmagic的核心非常简单,但是覆盖爬虫的整个流程,也是很好的学习爬虫开发的材料。 @@ -38,12 +39,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w us.codecraft webmagic-core - 0.7.3 + ${webmagic.version} us.codecraft webmagic-extension - 0.7.3 + ${webmagic.version} ``` diff --git a/README.md b/README.md index 73cb48833..2af81cb22 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,8 @@ [Readme in Chinese](https://github.com/code4craft/webmagic/tree/master/README-zh.md) +[![Maven Central](https://maven-badges.herokuapp.com/maven-central/us.codecraft/webmagic-parent/badge.svg?subject=Maven%20Central)](https://maven-badges.herokuapp.com/maven-central/us.codecraft/webmagic-parent/) +[![License](https://img.shields.io/badge/License-Apache%20License%202.0-blue.svg)](https://www.apache.org/licenses/LICENSE-2.0.html) [![Build Status](https://travis-ci.org/code4craft/webmagic.png?branch=master)](https://travis-ci.org/code4craft/webmagic) >A scalable crawler framework. It covers the whole lifecycle of crawler: downloading, url management, content extraction and persistent. 
It can simplify the development of a specific crawler. @@ -23,12 +25,12 @@ Add dependencies to your pom.xml: us.codecraft webmagic-core - 0.7.3 + ${webmagic.version} us.codecraft webmagic-extension - 0.7.3 + ${webmagic.version} ``` @@ -48,7 +50,7 @@ WebMagic use slf4j with slf4j-log4j12 implementation. If you customized your slf ### First crawler: -Write a class implements PageProcessor. For example, I wrote a crawler of github repository infomation. +Write a class implements PageProcessor. For example, I wrote a crawler of github repository information. ```java public class GithubRepoPageProcessor implements PageProcessor { @@ -110,15 +112,15 @@ public class GithubRepo { Documents: [http://webmagic.io/docs/](http://webmagic.io/docs/) -The architecture of webmagic (refered to [Scrapy](http://scrapy.org/)) +The architecture of webmagic (referred to [Scrapy](http://scrapy.org/)) ![image](http://code4craft.github.io/images/posts/webmagic.png) There are more examples in `webmagic-samples` package. -### Lisence: +### License: -Lisenced under [Apache 2.0 lisence](http://opensource.org/licenses/Apache-2.0) +Licensed under [Apache 2.0 license](http://opensource.org/licenses/Apache-2.0) ### Thanks: diff --git a/pom.xml b/pom.xml index d016d0a92..ffb9a2e86 100644 --- a/pom.xml +++ b/pom.xml @@ -1,17 +1,53 @@ - - us.codecraft - 0.7.3 + 4.0.0 + + org.oxerr + oxerr-parent + 2.2.1 + + us.codecraft + 1.0.3 pom UTF-8 UTF-8 - 1.8 + 11 + 11 + 3.23.1 + 1.5.0 + 4.4 + 2.14.0 + 3.18.0 + 2.0.19.graal + 3.0.13 + 32.0.0-jre + 2.29 + 4.5.13 + 4.4.15 + 3.7.1 + 9.4.13.0 + 2.10.0 + 5.10.2 + 1.10.2 + 2.7.3 + 2.25.3 + 2.0.2-beta + 1.3.0 + 1.2.0 + 12.4 + 4.14.1 + 2.0.4 4.0.0.RELEASE + 0.3.5 - webmagic-parent - webmagic-parent + webmagic + webmagic A crawler framework. It covers the whole lifecycle of crawler: downloading, url management, content extraction and persistent. It can simply the development of a specific crawler. 
@@ -33,7 +69,7 @@ scm:git:git@github.com:code4craft/webmagic.git scm:git:git@github.com:code4craft/webmagic.git git@github.com:code4craft/webmagic.git - webmagic-parent-0.6.1 + WebMagic-${project.version} @@ -49,66 +85,119 @@ webmagic-selenium webmagic-saxon webmagic-samples + webmagic-coverage + + + org.apache.logging.log4j + log4j-core + test + + + org.apache.logging.log4j + log4j-slf4j2-impl + test + + + org.junit.jupiter + junit-jupiter-engine + test + + + org.junit.vintage + junit-vintage-engine + test + + + org.junit.platform + junit-platform-launcher + test + + + org.junit.platform + junit-platform-runner + test + + + - - junit - junit - 4.13 - test - org.mockito mockito-all - 1.10.19 + ${mockito-all.version} test org.apache.httpcomponents httpclient - 4.5.12 + ${httpclient.version} org.apache.httpcomponents httpcore - 4.4.13 + ${httpcore.version} + + + org.apache.logging.log4j + log4j-core + ${log4j2.version} + + + org.apache.logging.log4j + log4j-slf4j2-impl + ${log4j2.version} com.google.guava guava - 29.0-jre + ${guava.version} com.jayway.jsonpath json-path - 2.4.0 + ${json-path.version} - org.slf4j - slf4j-api - 1.7.30 + org.junit.jupiter + junit-jupiter-engine + ${junit.version} + + + org.junit.vintage + junit-vintage-engine + ${junit.version} + + + org.junit.platform + junit-platform-launcher + ${junit.platform.version} + + + org.junit.platform + junit-platform-runner + ${junit.platform.version} org.slf4j - slf4j-log4j12 - 1.7.30 + slf4j-api + ${slf4j.version} us.codecraft xsoup - 0.3.1 + 0.3.7 com.alibaba fastjson - 1.2.68 + ${fastjson.version} com.github.dreamhead moco-core - 1.1.0 + ${moco.version} test @@ -117,179 +206,82 @@ - - log4j - log4j - 1.2.17 - org.assertj assertj-core - 3.16.1 + ${assertj.version} test org.apache.commons commons-lang3 - 3.10 + ${commons-lang3.version} - commons-collections - commons-collections - 3.2.2 + org.apache.commons + commons-collections4 + ${commons-collections4.version} - commons-io - commons-io - 2.7 - + 
commons-io + commons-io + ${commons-io.version} + org.codehaus.groovy groovy-all - 2.4.19 + ${groovy-all.version} org.jruby jruby - 9.2.11.1 - - - org.jsoup - jsoup - 1.10.3 + ${jruby.version} org.python jython - 2.7.2 + ${jython.version} org.seleniumhq.selenium selenium-java - 3.141.59 + ${selenium-java.version} net.sf.saxon Saxon-HE - 10.1 + ${saxon-he.version} net.sourceforge.htmlcleaner htmlcleaner - 2.5 + ${htmlcleaner.version} com.github.detro phantomjsdriver - 1.2.0 + ${phantomjsdriver.version} commons-cli commons-cli - 1.4 + ${commons-cli.version} redis.clients jedis - 2.9.3 + ${jedis.version} - - org.apache.maven.plugins - maven-enforcer-plugin - 3.0.0-M3 - - - enforce-maven - - enforce - - - - - 3.0.5 - - - - - - - - org.apache.maven.plugins - maven-surefire-plugin - 3.0.0-M4 - - 0 - - - - org.apache.maven.plugins - maven-compiler-plugin - 3.8.1 - - ${java.version} - ${java.version} - - - - - - - - - - - - - - - - - - - - - - - - org.apache.maven.plugins - maven-resources-plugin - 3.1.0 - - - org.apache.maven.plugins - maven-jar-plugin - 3.2.0 - - - log4j.xml - - - - - org.apache.maven.plugins - maven-source-plugin - 3.2.1 - - - attach-sources - - jar - - - - org.apache.maven.plugins maven-javadoc-plugin - 3.2.0 - UTF-8 - WebMagic 0.7.3 + WebMagic ${project.version} en_US @@ -313,83 +305,33 @@ - org.apache.maven.plugins - maven-release-plugin - 3.0.0-M1 + org.jacoco + jacoco-maven-plugin + + + + prepare-agent + + + + report + verify + + report + + + + + + com.amashchenko.maven.plugin + gitflow-maven-plugin + + + WebMagic- + + - - - release - - - - - org.apache.maven.plugins - maven-source-plugin - 2.2.1 - - - package - - jar-no-fork - - - - - - - org.apache.maven.plugins - maven-javadoc-plugin - 2.9.1 - - - package - - jar - - - - - - - org.apache.maven.plugins - maven-gpg-plugin - 1.6 - - - verify - - sign - - - - - - org.sonatype.plugins - nexus-staging-maven-plugin - 1.6.8 - true - - sonatype-nexus-staging - https://oss.sonatype.org/ - true - - - - - - 
- sonatype-nexus-snapshots - https://oss.sonatype.org/content/repositories/snapshots/ - - - sonatype-nexus-staging - https://oss.sonatype.org/service/local/staging/deploy/maven2/ - - - - diff --git a/src/site/site.xml b/src/site/site.xml new file mode 100644 index 000000000..b78651960 --- /dev/null +++ b/src/site/site.xml @@ -0,0 +1,23 @@ + + + org.apache.maven.skins + maven-fluido-skin + 1.11.1 + + + + + + + + + true + true + true + pull-right + + + diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 44fb7fa4d..bad11de43 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -1,9 +1,14 @@ - + us.codecraft - webmagic-parent - 0.7.3 + webmagic + 1.0.3 4.0.0 @@ -15,11 +20,6 @@ httpclient - - junit - junit - - org.apache.commons commons-lang3 @@ -46,14 +46,8 @@ - org.slf4j - slf4j-log4j12 - true - - - - commons-collections - commons-collections + org.apache.commons + commons-collections4 @@ -61,11 +55,6 @@ assertj-core - - org.jsoup - jsoup - - commons-io commons-io diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index c11df693c..18486f7a9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -4,7 +4,6 @@ import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.Json; import us.codecraft.webmagic.selector.Selectable; -import us.codecraft.webmagic.utils.HttpConstant; import us.codecraft.webmagic.utils.UrlUtils; import java.util.ArrayList; @@ -20,7 +19,7 @@ * {@link #getHtml()} get content of current page
* {@link #putField(String, Object)} save extracted result
* {@link #getResultItems()} get extract results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}
- * {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to fetch
+ * {@link #addTargetRequests(Iterable)} {@link #addTargetRequest(String)} add urls to fetch
* * @author code4crafter@gmail.com
* @see us.codecraft.webmagic.downloader.Downloader @@ -43,21 +42,78 @@ public class Page { private Map> headers; - private int statusCode = HttpConstant.StatusCode.CODE_200; + private int statusCode; - private boolean downloadSuccess = true; + private boolean downloadSuccess; private byte[] bytes; - private List targetRequests = new ArrayList(); + private List targetRequests = new ArrayList<>(); private String charset; - + + /** + * Returns a {@link Page} with {@link #downloadSuccess} is {@code true}, + * and {@link #request} is specified. + * + * @param request the request. + * @since 1.0.2 + */ + public static Page ofSuccess(Request request) { + return new Page(request, true); + } + + /** + * Returns a {@link Page} with {@link #downloadSuccess} is {@code false}, + * and {@link #request} is specified. + * + * @param request the request. + * @since 1.0.2 + */ + public static Page ofFailure(Request request) { + return new Page(request, false); + } + public Page() { } - public static Page fail(){ + /** + * Constructs a {@link Page} with {@link #request} + * and {@link #downloadSuccess} specified. + * + * @param request the request. + * @param downloadSuccess the download success flag. + * @since 1.0.2 + */ + private Page(Request request, boolean downloadSuccess) { + this.request = request; + this.downloadSuccess = downloadSuccess; + } + + /** + * Returns a {@link Page} with {@link #downloadSuccess} is {@code false}. + * + * @return the page. + * @deprecated Use {@link #fail(Request)} instead. + */ + @Deprecated + public static Page fail() { + return fail(null); + } + + /** + * Returns a {@link Page} with {@link #downloadSuccess} is {@code false}, + * and {@link #request} is specified. + * + * @param request the {@link Request}. + * @return the page. + * @since 0.10.0 + * @deprecated Use {@link #ofFailure(Request)} instead. 
+ */ + @Deprecated(since = "1.0.2", forRemoval = true) + public static Page fail(Request request){ Page page = new Page(); + page.setRequest(request); page.setDownloadSuccess(false); return page; } @@ -108,7 +164,8 @@ public Json getJson() { * @deprecated since 0.4.0 * The html is parse just when first time of calling {@link #getHtml()}, so use {@link #setRawText(String)} instead. */ - public void setHtml(Html html) { + @Deprecated + public void setHtml(Html html) { this.html = html; } @@ -121,14 +178,8 @@ public List getTargetRequests() { * * @param requests requests */ - public void addTargetRequests(List requests) { - for (String s : requests) { - if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) { - continue; - } - s = UrlUtils.canonicalizeUrl(s, url.toString()); - targetRequests.add(new Request(s)); - } + public void addTargetRequests(Iterable requests) { + addTargetRequests(requests, 0); // Default priority is 0 } /** @@ -137,14 +188,33 @@ public void addTargetRequests(List requests) { * @param requests requests * @param priority priority */ - public void addTargetRequests(List requests, long priority) { - for (String s : requests) { - if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) { - continue; - } - s = UrlUtils.canonicalizeUrl(s, url.toString()); - targetRequests.add(new Request(s).setPriority(priority)); + public void addTargetRequests(Iterable requests, long priority) { + if(requests == null) { + return; + } + + for (String req : requests) { + addRequestIfValid(req, priority); + } + } + + /** + * Helper method to add a request if it's valid. 
+ * + * @param url URL to add + * @param priority Priority for the URL + */ + private void addRequestIfValid(String url, long priority) { + if (StringUtils.isBlank(url) || url.equals("#") || url.startsWith("javascript:")) { + return; + } + + String canonicalizedUrl = UrlUtils.canonicalizeUrl(url, this.url.toString()); + Request req = new Request(canonicalizedUrl); + if(priority > 0) { + req.setPriority(priority); } + targetRequests.add(req); } /** diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index 5c26d20dc..a59b20637 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -1,12 +1,14 @@ package us.codecraft.webmagic; -import us.codecraft.webmagic.model.HttpRequestBody; -import us.codecraft.webmagic.utils.Experimental; - import java.io.Serializable; +import java.util.Collections; import java.util.HashMap; import java.util.Map; +import us.codecraft.webmagic.downloader.Downloader; +import us.codecraft.webmagic.model.HttpRequestBody; +import us.codecraft.webmagic.utils.Experimental; + /** * Object contains url to crawl.
* It contains some additional information.
@@ -26,10 +28,15 @@ public class Request implements Serializable { private HttpRequestBody requestBody; + /** + * The downloader to use for this request. + */ + private Downloader downloader; + /** * Store additional information in extras. */ - private Map extras; + private Map extras = new HashMap<>(); /** * cookies for current url, if not set use Site's cookies @@ -87,9 +94,6 @@ public T getExtra(String key) { } public Request putExtra(String key, T value) { - if (extras == null) { - extras = new HashMap(); - } extras.put(key, value); return this; } @@ -99,11 +103,11 @@ public String getUrl() { } public Map getExtras() { - return extras; + return Collections.unmodifiableMap(extras); } public Request setExtras(Map extras) { - this.extras = extras; + this.extras.putAll(extras); return this; } @@ -175,6 +179,14 @@ public boolean isBinaryContent() { return binaryContent; } + public Downloader getDownloader() { + return downloader; + } + + public void setDownloader(Downloader downloader) { + this.downloader = downloader; + } + public Request setBinaryContent(boolean binaryContent) { this.binaryContent = binaryContent; return this; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 72cc7d058..230337756 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -1,8 +1,13 @@ package us.codecraft.webmagic; -import us.codecraft.webmagic.utils.HttpConstant; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Set; +import java.util.UUID; -import java.util.*; +import us.codecraft.webmagic.utils.HttpConstant; /** * Object contains setting for crawler.
@@ -23,6 +28,8 @@ public class Site { private String charset; + private String defaultCharset; + private int sleepTime = 5000; private int retryTimes = 0; @@ -163,6 +170,30 @@ public String getCharset() { return charset; } + /** + * Set default charset of page. + * + * When charset detection fails, use this default charset. + * + * @param defaultCharset the default charset + * @return this + * @since 0.9.0 + */ + public Site setDefaultCharset(String defaultCharset) { + this.defaultCharset = defaultCharset; + return this; + } + + /** + * The default charset if charset detection failed. + * + * @return the default charset + * @since 0.9.0 + */ + public String getDefaultCharset() { + return defaultCharset; + } + public int getTimeOut() { return timeOut; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 62c989f1d..a71166421 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -1,6 +1,17 @@ package us.codecraft.webmagic; -import org.apache.commons.collections.CollectionUtils; + +import java.io.Closeable; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Date; +import java.util.List; +import java.util.UUID; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; +import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.lang3.SerializationUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -17,16 +28,6 @@ import us.codecraft.webmagic.utils.UrlUtils; import us.codecraft.webmagic.utils.WMCollections; -import java.io.Closeable; -import java.io.IOException; -import java.util.*; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicInteger; 
-import java.util.concurrent.atomic.AtomicLong; -import java.util.concurrent.locks.Condition; -import java.util.concurrent.locks.ReentrantLock; - /** * Entrance of a crawler.
* A spider contains four modules: Downloader, Scheduler, PageProcessor and @@ -71,9 +72,9 @@ public class Spider implements Runnable, Task { protected Site site; protected String uuid; - - protected Scheduler scheduler = new QueueScheduler(); - + + protected SpiderScheduler scheduler; + protected Logger logger = LoggerFactory.getLogger(getClass()); protected CountableThreadPool threadPool; @@ -84,7 +85,7 @@ public class Spider implements Runnable, Task { protected AtomicInteger stat = new AtomicInteger(STAT_INIT); - protected boolean exitWhenComplete = true; + protected volatile boolean exitWhenComplete = true; protected final static int STAT_INIT = 0; @@ -96,17 +97,13 @@ public class Spider implements Runnable, Task { protected boolean destroyWhenExit = true; - private ReentrantLock newUrlLock = new ReentrantLock(); - - private Condition newUrlCondition = newUrlLock.newCondition(); - private List spiderListeners; private final AtomicLong pageCount = new AtomicLong(0); private Date startTime; - private int emptySleepTime = 30000; + private long emptySleepTime = 30000; /** * create a spider with pageProcessor. 
@@ -127,6 +124,7 @@ public static Spider create(PageProcessor pageProcessor) { public Spider(PageProcessor pageProcessor) { this.pageProcessor = pageProcessor; this.site = pageProcessor.getSite(); + this.scheduler = new SpiderScheduler(new QueueScheduler()); } /** @@ -182,15 +180,15 @@ public Spider scheduler(Scheduler scheduler) { /** * set scheduler for Spider * - * @param scheduler scheduler + * @param updateScheduler scheduler * @return this * @see Scheduler * @since 0.2.1 */ - public Spider setScheduler(Scheduler scheduler) { + public Spider setScheduler(Scheduler updateScheduler) { checkIfRunning(); - Scheduler oldScheduler = this.scheduler; - this.scheduler = scheduler; + Scheduler oldScheduler = scheduler.getScheduler(); + scheduler.setScheduler(updateScheduler); if (oldScheduler != null) { Request request; while ((request = oldScheduler.poll(this)) != null) { @@ -208,6 +206,7 @@ public Spider setScheduler(Scheduler scheduler) { * @see #addPipeline(us.codecraft.webmagic.pipeline.Pipeline) * @deprecated */ + @Deprecated public Spider pipeline(Pipeline pipeline) { return addPipeline(pipeline); } @@ -258,6 +257,7 @@ public Spider clearPipeline() { * @see #setDownloader(us.codecraft.webmagic.downloader.Downloader) * @deprecated */ + @Deprecated public Spider downloader(Downloader downloader) { return setDownloader(downloader); } @@ -303,32 +303,54 @@ protected void initComponent() { public void run() { checkRunningStat(); initComponent(); - logger.info("Spider {} started!",getUUID()); + logger.info("Spider {} started!", getUUID()); + // interrupt won't be necessarily detected while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) { - final Request request = scheduler.poll(this); - if (request == null) { - if (threadPool.getThreadAlive() == 0 && exitWhenComplete) { - break; - } - // wait until new url added - waitNewUrl(); - } else { - threadPool.execute(new Runnable() { - @Override - public void run() { - try { - processRequest(request); 
- onSuccess(request); - } catch (Exception e) { - onError(request); - logger.error("process request " + request + " error", e); - } finally { - pageCount.incrementAndGet(); - signalNewUrl(); + Request poll = scheduler.poll(this); + if (poll == null) { + if (threadPool.getThreadAlive() == 0) { + //no alive thread anymore , try again + poll = scheduler.poll(this); + if (poll == null) { + if (exitWhenComplete) { + break; + } else { + // wait + try { + Thread.sleep(emptySleepTime); + continue; + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + break; + } } } - }); + } else { + // wait until new url added, + if (scheduler.waitNewUrl(threadPool, emptySleepTime)) { + // if interrupted + break; + } + continue; + } } + final Request request = poll; + //this may swallow the interruption + threadPool.execute(new Runnable() { + @Override + public void run() { + try { + processRequest(request); + onSuccess(request); + } catch (Exception e) { + onError(request, e); + logger.error("process request " + request + " error", e); + } finally { + pageCount.incrementAndGet(); + scheduler.signalNewUrl(); + } + } + }); } stat.set(STAT_STOPPED); // release some resources @@ -338,10 +360,19 @@ public void run() { logger.info("Spider {} closed! {} pages downloaded.", getUUID(), pageCount.get()); } + /** + * @deprecated Use {@link #onError(Request, Exception)} instead. + */ + @Deprecated protected void onError(Request request) { + } + + protected void onError(Request request, Exception e) { + this.onError(request); + if (CollectionUtils.isNotEmpty(spiderListeners)) { for (SpiderListener spiderListener : spiderListeners) { - spiderListener.onError(request); + spiderListener.onError(request, e); } } } @@ -401,7 +432,12 @@ public void test(String... 
urls) { } private void processRequest(Request request) { - Page page = downloader.download(request, this); + Page page; + if (null != request.getDownloader()){ + page = request.getDownloader().download(request,this); + }else { + page = downloader.download(request, this); + } if (page.isDownloadSuccess()){ onDownloadSuccess(request, page); } else { @@ -422,7 +458,6 @@ private void onDownloadSuccess(Request request, Page page) { logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode()); } sleep(site.getSleepTime()); - return; } private void onDownloaderFail(Request request) { @@ -453,6 +488,7 @@ protected void sleep(int time) { Thread.sleep(time); } catch (InterruptedException e) { logger.error("Thread interrupted when sleep",e); + Thread.currentThread().interrupt(); } } @@ -493,7 +529,7 @@ public Spider addUrl(String... urls) { for (String url : urls) { addRequest(new Request(url)); } - signalNewUrl(); + scheduler.signalNewUrl(); return this; } @@ -545,34 +581,10 @@ public Spider addRequest(Request... 
requests) { for (Request request : requests) { addRequest(request); } - signalNewUrl(); + scheduler.signalNewUrl(); return this; } - private void waitNewUrl() { - newUrlLock.lock(); - try { - //double check - if (threadPool.getThreadAlive() == 0 && exitWhenComplete) { - return; - } - newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS); - } catch (InterruptedException e) { - logger.warn("waitNewUrl - interrupted, error {}", e); - } finally { - newUrlLock.unlock(); - } - } - - private void signalNewUrl() { - try { - newUrlLock.lock(); - newUrlCondition.signalAll(); - } finally { - newUrlLock.unlock(); - } - } - public void start() { runAsync(); } @@ -585,6 +597,13 @@ public void stop() { } } + /** + * Stop when all tasks in the queue are completed and all worker threads are also completed + */ + public void stopWhenComplete(){ + this.exitWhenComplete = true; + } + /** * start with more than one threads * @@ -748,15 +767,20 @@ public Date getStartTime() { } public Scheduler getScheduler() { - return scheduler; + return scheduler.getScheduler(); } /** * Set wait time when no url is polled.

* * @param emptySleepTime In MILLISECONDS. + * @return this */ - public void setEmptySleepTime(int emptySleepTime) { + public Spider setEmptySleepTime(long emptySleepTime) { + if(emptySleepTime<=0){ + throw new IllegalArgumentException("emptySleepTime should be more than zero!"); + } this.emptySleepTime = emptySleepTime; + return this; } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderListener.java b/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderListener.java index 067818038..b55ef3d7f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderListener.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderListener.java @@ -8,7 +8,17 @@ */ public interface SpiderListener { - public void onSuccess(Request request); + void onSuccess(Request request); + + /** + * @deprecated Use {@link #onError(Request, Exception)} instead. + */ + @Deprecated + default void onError(Request request) { + } + + default void onError(Request request, Exception e) { + this.onError(request); + } - public void onError(Request request); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderScheduler.java new file mode 100644 index 000000000..1005bac88 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/SpiderScheduler.java @@ -0,0 +1,59 @@ +package us.codecraft.webmagic; + +import java.util.concurrent.TimeUnit; +import java.util.concurrent.locks.Condition; +import java.util.concurrent.locks.ReentrantLock; + +import us.codecraft.webmagic.scheduler.Scheduler; +import us.codecraft.webmagic.thread.CountableThreadPool; + +public class SpiderScheduler { + private Scheduler scheduler; + private final ReentrantLock newUrlLock = new ReentrantLock(); + private final Condition newUrlCondition = newUrlLock.newCondition(); + + public SpiderScheduler(Scheduler scheduler) { + this.scheduler = scheduler; + } + + public Scheduler getScheduler() { + 
return scheduler; + } + + public void setScheduler(Scheduler scheduler) { + this.scheduler = scheduler; + } + + public Request poll(Spider spider) { + return scheduler.poll(spider); + } + + public void push(Request request, Spider spider) { + scheduler.push(request, spider); + } + + public boolean waitNewUrl(CountableThreadPool threadPool, long emptySleepTime) { + newUrlLock.lock(); + try { + if (threadPool.getThreadAlive() == 0) { + return false; + } + newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS); + return false; + } catch (InterruptedException e) { + return true; + } finally { + newUrlLock.unlock(); + } + } + + public void signalNewUrl() { + try { + newUrlLock.lock(); + newUrlCondition.signalAll(); + } finally { + newUrlLock.unlock(); + } + } + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java index c27292d09..6a400e321 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java @@ -3,6 +3,7 @@ import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; import us.codecraft.webmagic.selector.Html; /** @@ -26,7 +27,7 @@ public Html download(String url) { /** * A simple method to download a url. * - * @param url url + * @param url url * @param charset charset * @return html */ @@ -35,10 +36,62 @@ public Html download(String url, String charset) { return (Html) page.getHtml(); } + /** + * @param request the {@link Request}. + * @deprecated Use {@link #onSuccess(Page, Task)} instead. + */ + @Deprecated protected void onSuccess(Request request) { } + /** + * @param request the {@link Request}. + * @param task the {@link Task}. + * @since 0.7.6 + * @deprecated Use {@link #onSuccess(Page, Task)} instead. 
+ */ + @Deprecated + protected void onSuccess(Request request, Task task) { + this.onSuccess(request); + } + + /** + * @param page the {@link Page}. + * @param task the {@link Task}. + * @since 0.10.0 + */ + protected void onSuccess(Page page, Task task) { + this.onSuccess(page.getRequest(), task); + } + + /** + * @param request the {@link Request}. + * @deprecated Use {@link #onError(Page, Task, Throwable)} instead. + */ + @Deprecated protected void onError(Request request) { } + /** + * @param request the {@link Request}. + * @param task the {@link Task}. + * @param e the exception. + * @since 0.7.6 + * @deprecated Use {@link #onError(Page, Task, Throwable)} instead. + */ + @Deprecated + protected void onError(Request request, Task task, Throwable e) { + this.onError(request); + } + + /** + * @param page the {@link Page}. + * @param task the {@link Task}. + * @param e the exception. + * @since 0.10.0 + */ + protected void onError(Page page, Task task, Throwable e) { + this.onError(page.getRequest(), task, e); + } + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 24889c88b..6fdae38d8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -1,12 +1,18 @@ package us.codecraft.webmagic.downloader; +import java.io.IOException; +import java.nio.charset.Charset; +import java.util.HashMap; +import java.util.Map; +import java.util.Optional; + import org.apache.commons.io.IOUtils; +import org.apache.http.HttpEntity; import org.apache.http.HttpResponse; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.util.EntityUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; + import 
us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; @@ -17,12 +23,6 @@ import us.codecraft.webmagic.utils.CharsetUtils; import us.codecraft.webmagic.utils.HttpClientUtils; -import java.io.IOException; -import java.nio.charset.Charset; -import java.util.HashMap; -import java.util.Map; - - /** * The http downloader based on HttpClient. * @@ -31,14 +31,12 @@ */ public class HttpClientDownloader extends AbstractDownloader { - private Logger logger = LoggerFactory.getLogger(getClass()); - private final Map httpClients = new HashMap(); private HttpClientGenerator httpClientGenerator = new HttpClientGenerator(); private HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter(); - + private ProxyProvider proxyProvider; private boolean responseHeader = true; @@ -76,18 +74,17 @@ public Page download(Request request, Task task) { } CloseableHttpResponse httpResponse = null; CloseableHttpClient httpClient = getHttpClient(task.getSite()); - Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(task) : null; + Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(request, task) : null; HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy); - Page page = Page.fail(); + Page page = null; try { httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext()); page = handleResponse(request, request.getCharset() != null ? 
request.getCharset() : task.getSite().getCharset(), httpResponse, task); - onSuccess(request); - logger.info("downloading page success {}", request.getUrl()); + onSuccess(page, task); return page; } catch (IOException e) { - logger.warn("download page {} error", request.getUrl(), e); - onError(request); + page = Page.ofFailure(request); + onError(page, task, e); return page; } finally { if (httpResponse != null) { @@ -106,13 +103,14 @@ public void setThread(int thread) { } protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException { - byte[] bytes = IOUtils.toByteArray(httpResponse.getEntity().getContent()); - String contentType = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue(); - Page page = new Page(); + HttpEntity entity = httpResponse.getEntity(); + byte[] bytes = entity != null ? IOUtils.toByteArray(entity.getContent()) : new byte[0]; + String contentType = entity != null && entity.getContentType() != null ? 
entity.getContentType().getValue() : null; + Page page = Page.ofSuccess(request); page.setBytes(bytes); - if (!request.isBinaryContent()){ + if (!request.isBinaryContent()) { if (charset == null) { - charset = getHtmlCharset(contentType, bytes); + charset = getHtmlCharset(contentType, bytes, task); } page.setCharset(charset); page.setRawText(new String(bytes, charset)); @@ -120,18 +118,16 @@ protected Page handleResponse(Request request, String charset, HttpResponse http page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); page.setStatusCode(httpResponse.getStatusLine().getStatusCode()); - page.setDownloadSuccess(true); if (responseHeader) { page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders())); } return page; } - private String getHtmlCharset(String contentType, byte[] contentBytes) throws IOException { + private String getHtmlCharset(String contentType, byte[] contentBytes, Task task) throws IOException { String charset = CharsetUtils.detectCharset(contentType, contentBytes); if (charset == null) { - charset = Charset.defaultCharset().name(); - logger.warn("Charset autodetect failed, use {} as charset. 
Please specify charset in Site.setCharset()", Charset.defaultCharset()); + charset = Optional.ofNullable(task.getSite().getDefaultCharset()).orElseGet(Charset.defaultCharset()::name); } return charset; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index ee94581ad..94b00cc73 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -1,16 +1,5 @@ package us.codecraft.webmagic.downloader; -import java.io.IOException; -import java.security.KeyManagementException; -import java.security.NoSuchAlgorithmException; -import java.security.cert.CertificateException; -import java.security.cert.X509Certificate; -import java.util.Map; - -import javax.net.ssl.SSLContext; -import javax.net.ssl.TrustManager; -import javax.net.ssl.X509TrustManager; - import org.apache.commons.lang3.JavaVersion; import org.apache.commons.lang3.SystemUtils; import org.apache.http.HttpException; @@ -22,28 +11,32 @@ import org.apache.http.config.SocketConfig; import org.apache.http.conn.socket.ConnectionSocketFactory; import org.apache.http.conn.socket.PlainConnectionSocketFactory; -import org.apache.http.conn.ssl.DefaultHostnameVerifier; import org.apache.http.conn.ssl.SSLConnectionSocketFactory; -import org.apache.http.impl.client.BasicCookieStore; -import org.apache.http.impl.client.CloseableHttpClient; -import org.apache.http.impl.client.DefaultHttpRequestRetryHandler; -import org.apache.http.impl.client.HttpClientBuilder; -import org.apache.http.impl.client.HttpClients; +import org.apache.http.impl.client.*; import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; import org.apache.http.impl.cookie.BasicClientCookie; import org.apache.http.protocol.HttpContext; import org.slf4j.Logger; import 
org.slf4j.LoggerFactory; - import us.codecraft.webmagic.Site; +import javax.net.ssl.SSLContext; +import javax.net.ssl.TrustManager; +import javax.net.ssl.X509TrustManager; +import java.io.IOException; +import java.security.KeyManagementException; +import java.security.NoSuchAlgorithmException; +import java.security.cert.CertificateException; +import java.security.cert.X509Certificate; +import java.util.Map; + /** * @author code4crafter@gmail.com
* @since 0.4.0 */ public class HttpClientGenerator { - private transient Logger logger = LoggerFactory.getLogger(getClass()); + private Logger logger = LoggerFactory.getLogger(getClass()); private PoolingHttpClientConnectionManager connectionManager; @@ -61,21 +54,20 @@ private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() { SSLContext sslContext = createIgnoreVerifySSL(); String[] supportedProtocols; if (SystemUtils.isJavaVersionAtLeast(JavaVersion.JAVA_11)) { - supportedProtocols = new String[] { "SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2", "TLSv1.3" }; + supportedProtocols = new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2", "TLSv1.3"}; } else { - supportedProtocols = new String[] { "SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2" }; + supportedProtocols = new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2"}; } logger.debug("supportedProtocols: {}", String.join(", ", supportedProtocols)); return new SSLConnectionSocketFactory(sslContext, supportedProtocols, null, - new DefaultHostnameVerifier()); // 优先绕过安全证书 - } catch (KeyManagementException e) { - logger.error("ssl connection fail", e); - } catch (NoSuchAlgorithmException e) { + //不进行主机校验 + (host, sslSession) -> true); // 优先绕过安全证书 + } catch (KeyManagementException | NoSuchAlgorithmException e) { logger.error("ssl connection fail", e); } return SSLConnectionSocketFactory.getSocketFactory(); - } + } private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException { // 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法 @@ -96,10 +88,10 @@ public X509Certificate[] getAcceptedIssuers() { }; - SSLContext sc = SSLContext.getInstance("SSLv3"); - sc.init(null, new TrustManager[] { trustManager }, null); + SSLContext sc = SSLContext.getInstance("TLS"); + sc.init(null, new TrustManager[]{trustManager}, null); return sc; - } + } public HttpClientGenerator setPoolSize(int poolSize) { connectionManager.setMaxTotal(poolSize); diff --git 
a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java index 4baaf4a4a..168467866 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java @@ -42,7 +42,9 @@ private HttpClientContext convertHttpClientContext(Request request, Site site, P HttpClientContext httpContext = new HttpClientContext(); if (proxy != null && proxy.getUsername() != null) { AuthState authState = new AuthState(); - authState.update(new BasicScheme(ChallengeState.PROXY), new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword())); + BasicScheme proxyAuthScheme = new BasicScheme(ChallengeState.PROXY); + UsernamePasswordCredentials proxyCredentials = new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword()); + authState.update(proxyAuthScheme, proxyCredentials); httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, authState); } if (request.getCookies() != null && !request.getCookies().isEmpty()) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java index 1fb125c72..3d79b96a9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java @@ -4,13 +4,16 @@ import us.codecraft.webmagic.Site; /** - * Interface to be implemented to customize a crawler.
- *
+ * Interface to be implemented to customize a crawler. + * + *

* In PageProcessor, you can customize: - *
- * start urls and other settings in {@link Site}
- * how the urls to fetch are detected
- * how the data are extracted and stored
+ *

+ *
    + *
  • start URLs and other settings in {@link Site}
  • + *
  • how the URLs to fetch are detected
  • + *
  • how the data are extracted and stored
  • + *
* * @author code4crafter@gmail.com
* @see Site @@ -20,17 +23,20 @@ public interface PageProcessor { /** - * process the page, extract urls to fetch, extract the data and store + * Processes the page, extract URLs to fetch, extract the data and store. * * @param page page */ - public void process(Page page); + void process(Page page); /** - * get the site settings + * Returns the site settings. * * @return site * @see Site */ - public Site getSite(); + default Site getSite() { + return Site.me(); + } + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java index 0cef4ed42..8eab4d6de 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java @@ -1,6 +1,7 @@ package us.codecraft.webmagic.proxy; import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; /** @@ -23,7 +24,23 @@ public interface ProxyProvider { * Get a proxy for task by some strategy. * @param task the download task * @return proxy + * @deprecated Use {@link #getProxy(Request, Task)} instead. */ - Proxy getProxy(Task task); + @Deprecated + default Proxy getProxy(Task task) { + throw new UnsupportedOperationException(); + } + + /** + * Returns a proxy for the request. 
+ * + * @param request the request + * @param task the download task + * @return proxy + * @since 0.9.0 + */ + default Proxy getProxy(Request request, Task task) { + return this.getProxy(task); + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java index ddef6a88c..f4c3f73bb 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java @@ -1,6 +1,7 @@ package us.codecraft.webmagic.proxy; import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; import java.util.ArrayList; @@ -44,7 +45,7 @@ public void returnProxy(Proxy proxy, Page page, Task task) { } @Override - public Proxy getProxy(Task task) { + public Proxy getProxy(Request request, Task task) { return proxies.get(incrForLoop()); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java index f9ad0e98f..19d3bc732 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java @@ -1,26 +1,51 @@ package us.codecraft.webmagic.scheduler; -import us.codecraft.webmagic.Request; -import us.codecraft.webmagic.Task; - import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingQueue; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; /** * Basic Scheduler implementation.
 * Store urls to fetch in LinkedBlockingQueue and remove duplicate urls by HashMap. * + * Note: if you use this {@link QueueScheduler} + * with {@link Site#getCycleRetryTimes()} enabled, you may encounter a deadlock + * when the queue is full. + * * @author code4crafter@gmail.com
* @since 0.1.0 */ public class QueueScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler { - private BlockingQueue queue = new LinkedBlockingQueue(); + private final BlockingQueue queue; + + public QueueScheduler() { + this.queue = new LinkedBlockingQueue<>(); + } + + /** + * Creates a {@code QueueScheduler} with the given (fixed) capacity. + * + * @param capacity the capacity of this queue, + * see {@link LinkedBlockingQueue#LinkedBlockingQueue(int)} + * @since 0.8.0 + */ + public QueueScheduler(int capacity) { + this.queue = new LinkedBlockingQueue<>(capacity); + } @Override public void pushWhenNoDuplicate(Request request, Task task) { - queue.add(request); + logger.trace("Remaining capacity: {}", this.queue.remainingCapacity()); + + try { + queue.put(request); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } } @Override diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java index e2bb55215..1fb35f1a8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java @@ -1,10 +1,11 @@ package us.codecraft.webmagic.selector; -import org.apache.commons.collections.CollectionUtils; import java.util.ArrayList; import java.util.List; +import org.apache.commons.collections4.CollectionUtils; + /** * @author code4crafer@gmail.com * @since 0.5.2 @@ -55,11 +56,12 @@ public Selectable jsonPath(String jsonPath) { @Override public String get() { - if (CollectionUtils.isNotEmpty(all())) { - return all().get(0); - } else { - return null; - } + List sourceTexts = all(); + if (CollectionUtils.isNotEmpty(sourceTexts)) { + return sourceTexts.get(0); + } + return null; + } @Override @@ -91,8 +93,9 @@ public Selectable replace(String regex, String replacement) { } public String 
getFirstSourceText() { - if (getSourceTexts() != null && getSourceTexts().size() > 0) { - return getSourceTexts().get(0); + List sourceTexts = getSourceTexts(); + if (CollectionUtils.isNotEmpty(sourceTexts)) { + return sourceTexts.get(0); } return null; } @@ -104,6 +107,6 @@ public String toString() { @Override public boolean match() { - return getSourceTexts() != null && getSourceTexts().size() > 0; + return CollectionUtils.isNotEmpty(getSourceTexts()); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java index bbc7217ab..6001767d8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java @@ -1,7 +1,9 @@ package us.codecraft.webmagic.selector; import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; +import us.codecraft.webmagic.utils.BaseSelectorUtils; import java.util.ArrayList; import java.util.List; @@ -11,11 +13,17 @@ * @since 0.3.0 */ public abstract class BaseElementSelector implements Selector, ElementSelector { + private Document parse(String text) { + // Jsoup could not parse or tag directly + // https://stackoverflow.com/questions/63607740/jsoup-couldnt-parse-tr-tag + text = BaseSelectorUtils.preParse(text); + return Jsoup.parse(text); + } @Override public String select(String text) { if (text != null) { - return select(Jsoup.parse(text)); + return select(parse(text)); } return null; } @@ -23,7 +31,7 @@ public String select(String text) { @Override public List selectList(String text) { if (text != null) { - return selectList(Jsoup.parse(text)); + return selectList(parse(text)); } else { return new ArrayList(); } @@ -31,14 +39,14 @@ public List selectList(String text) { public Element selectElement(String text) { if (text != null) { - return 
selectElement(Jsoup.parse(text)); + return selectElement(parse(text)); } return null; } public List selectElements(String text) { if (text != null) { - return selectElements(Jsoup.parse(text)); + return selectElements(parse(text)); } else { return new ArrayList(); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java index 6a638dbff..cfe55472a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java @@ -1,14 +1,14 @@ package us.codecraft.webmagic.selector; -import org.apache.commons.collections.CollectionUtils; + +import java.util.ArrayList; +import java.util.List; +import org.apache.commons.collections4.CollectionUtils; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; import org.jsoup.nodes.TextNode; import org.jsoup.select.Elements; -import java.util.ArrayList; -import java.util.List; - /** * CSS selector. Based on Jsoup. 
* diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java index c063b4825..74ea718e5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java @@ -26,12 +26,16 @@ protected List getElements() { return elements; } - @Override public Selectable smartContent() { SmartContentSelector smartContentSelector = Selectors.smartContent(); return select(smartContentSelector, getSourceTexts()); } + public Selectable smartContent(int threshold) { + SmartContentSelector smartContentSelector = Selectors.smartContent(threshold); + return select(smartContentSelector, getSourceTexts()); + } + @Override public Selectable links() { return selectElements(new LinksSelector()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java index f5c0baeb5..aa9a903f7 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java @@ -1,11 +1,11 @@ package us.codecraft.webmagic.selector; -import com.alibaba.fastjson.JSON; -import com.jayway.jsonpath.JsonPath; import java.util.ArrayList; import java.util.List; import java.util.Map; +import com.alibaba.fastjson.JSON; +import com.jayway.jsonpath.JsonPath; /** * JsonPath selector.
@@ -16,15 +16,20 @@ */ public class JsonPathSelector implements Selector { - private String jsonPathStr; + private final String jsonPathStr; - private JsonPath jsonPath; + private final JsonPath jsonPath; public JsonPathSelector(String jsonPathStr) { this.jsonPathStr = jsonPathStr; this.jsonPath = JsonPath.compile(this.jsonPathStr); } + @SuppressWarnings("unused") + public String getJsonPathStr() { + return jsonPathStr; + } + @Override public String select(String text) { Object object = jsonPath.read(text); @@ -32,8 +37,8 @@ public String select(String text) { return null; } if (object instanceof List) { - List list = (List) object; - if (list != null && list.size() > 0) { + List list = (List) object; + if (list.size() > 0) { return toString(list.iterator().next()); } } @@ -49,8 +54,9 @@ private String toString(Object object) { } @Override + @SuppressWarnings("unchecked") public List selectList(String text) { - List list = new ArrayList(); + List list = new ArrayList<>(); Object object = jsonPath.read(text); if (object == null) { return list; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java index 5296a74bd..2dafe8ee9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/LinksSelector.java @@ -1,12 +1,12 @@ package us.codecraft.webmagic.selector; -import org.jsoup.helper.StringUtil; -import org.jsoup.nodes.Element; -import org.jsoup.select.Elements; - import java.util.ArrayList; import java.util.List; +import org.apache.commons.lang3.StringUtils; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + /** * Links selector based on jsoup. Use absolute url.
* @@ -23,9 +23,9 @@ public String select(Element element) { @Override public List selectList(Element element) { Elements elements = element.select("a"); - List links = new ArrayList(elements.size()); + List links = new ArrayList<>(elements.size()); for (Element element0 : elements) { - if (!StringUtil.isBlank(element0.baseUri())) { + if (StringUtils.isNotBlank(element0.baseUri())) { links.add(element0.attr("abs:href")); } else { links.add(element0.attr("href")); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java index c78f6791b..18258e9a7 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java @@ -42,11 +42,6 @@ public Selectable xpath(String xpath) { throw new UnsupportedOperationException("$ can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc)."); } - @Override - public Selectable smartContent() { - throw new UnsupportedOperationException("Smart content can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc)."); - } - @Override public Selectable links() { throw new UnsupportedOperationException("Links can not apply to plain text. 
Please check whether you use a previous xpath with attribute select (/@href etc)."); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java index 9412cfce4..a4d5fdb94 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java @@ -51,14 +51,6 @@ public interface Selectable { * @return new Selectable after extract */ public Selectable css(String selector, String attrName); - - /** - * select smart content with ReadAbility algorithm - * - * @return content - */ - public Selectable smartContent(); - /** * select all links * diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java index 7cd68c1d6..3600896e2 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java @@ -20,6 +20,10 @@ public static SmartContentSelector smartContent() { return new SmartContentSelector(); } + public static SmartContentSelector smartContent(int threshold) { + return new SmartContentSelector(threshold); + } + public static CssSelector $(String expr) { return new CssSelector(expr); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java index ff8e26998..c8816510b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java @@ -16,9 +16,15 @@ @Experimental public class SmartContentSelector implements Selector { + private int threshold = 86; + public SmartContentSelector() { } + public SmartContentSelector(int 
threshold) { + this.threshold = threshold; + } + @Override public String select(String html) { html = html.replaceAll("(?is)", ""); @@ -29,7 +35,6 @@ public String select(String html) { html = html.replaceAll("(?is)<.*?>", ""); List lines; int blocksWidth =3; - int threshold =86; int start; int end; StringBuilder text = new StringBuilder(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java index 8a980a50d..4fa14699e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java @@ -1,12 +1,12 @@ package us.codecraft.webmagic.selector; -import org.apache.commons.collections.CollectionUtils; + +import java.util.List; +import org.apache.commons.collections4.CollectionUtils; import org.jsoup.nodes.Element; import us.codecraft.xsoup.XPathEvaluator; import us.codecraft.xsoup.Xsoup; -import java.util.List; - /** * XPath selector based on Xsoup.
* diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/BaseSelectorUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/BaseSelectorUtils.java new file mode 100644 index 000000000..04c0651c3 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/BaseSelectorUtils.java @@ -0,0 +1,23 @@ +package us.codecraft.webmagic.utils; + +/** + * @author hooy + */ +public class BaseSelectorUtils { + + /** + * Jsoup/HtmlCleaner could not parse "tr" or "td" tag directly + * https://stackoverflow.com/questions/63607740/jsoup-couldnt-parse-tr-tag + * + * @param text - the html string + * @return text + */ + public static String preParse(String text) { + if (((text.startsWith("") || text.startsWith("")) + || ((text.startsWith("") || text.startsWith(""))) { + text = "" + text + "
"; + } + return text; + } + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java index ccf00a466..63bb4c110 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java @@ -21,6 +21,10 @@ public abstract class CharsetUtils { private static Logger logger = LoggerFactory.getLogger(CharsetUtils.class); + private CharsetUtils() { + throw new AssertionError("No us.codecraft.webmagic.utils.CharsetUtils instances for you!"); + } + public static String detectCharset(String contentType, byte[] contentBytes) throws IOException { String charset; // charset diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/NumberUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/NumberUtils.java index 55e185105..fbeb8ed3b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/NumberUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/NumberUtils.java @@ -6,12 +6,6 @@ public abstract class NumberUtils { public static int compareLong(long o1, long o2) { - if (o1 < o2) { - return -1; - } else if (o1 == o2) { - return 0; - } else { - return 1; - } + return Long.compare(o1, o2); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java index c61483a39..ea317c405 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java @@ -116,6 +116,10 @@ public static List convertToUrls(Collection requests) { private static final Pattern patternForCharset = Pattern.compile("charset\\s*=\\s*['\"]*([^\\s;'\"]*)", Pattern.CASE_INSENSITIVE); public static String getCharset(String contentType) { + if (contentType == null) { + return 
null; + } + Matcher matcher = patternForCharset.matcher(contentType); if (matcher.find()) { String charset = matcher.group(1); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/WMCollections.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/WMCollections.java index 23e1644ce..a2ca5afd0 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/WMCollections.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/WMCollections.java @@ -21,10 +21,10 @@ public static Set newHashSet(T... t){ } public static List newArrayList(T... t){ - List set = new ArrayList(t.length); + List list = new ArrayList(t.length); for (T t1 : t) { - set.add(t1); + list.add(t1); } - return set; + return list; } } diff --git a/webmagic-core/src/main/resources/log4j.xml b/webmagic-core/src/main/resources/log4j.xml deleted file mode 100644 index c2b5a2f53..000000000 --- a/webmagic-core/src/main/resources/log4j.xml +++ /dev/null @@ -1,21 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/RequestTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/RequestTest.java index c7e4943d9..b8f699a6f 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/RequestTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/RequestTest.java @@ -1,9 +1,13 @@ package us.codecraft.webmagic; +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.Collections; +import java.util.Map; + import org.junit.Test; -import us.codecraft.webmagic.utils.HttpConstant; -import static org.assertj.core.api.Assertions.assertThat; +import us.codecraft.webmagic.utils.HttpConstant; /** * @author code4crafter@gmail.com @@ -22,4 +26,28 @@ public void testEqualsAndHashCode() throws Exception { assertThat(requestA).isNotEqualTo(requestB); assertThat(requestA.hashCode()).isNotEqualTo(requestB.hashCode()); } + + @Test + public void testSetExtras() { + Request request = new 
Request(); + Map extras = Collections.singletonMap("a", "1"); + request.setExtras(extras); + request.putExtra("b", "2"); + assertThat(request.getExtra("a")).isEqualTo("1"); + assertThat(request.getExtra("b")).isEqualTo("2"); + } + + @Test + public void testGetExtras() { + Request request = new Request(); + request.putExtra("a", "1"); + assertThat(request.getExtras()).containsEntry("a", "1"); + } + + @Test(expected = UnsupportedOperationException.class) + public void testGetExtrasShouldBeUnmodifiable() { + Request request = new Request(); + request.getExtras().put("a", "1"); + } + } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/SiteTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/SiteTest.java new file mode 100644 index 000000000..47c4fcc14 --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/SiteTest.java @@ -0,0 +1,40 @@ +package us.codecraft.webmagic; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +import org.junit.Test; + +public class SiteTest { + + @Test + public void test() { + Site site = Site.me().setDefaultCharset(StandardCharsets.UTF_8.name()); + assertEquals(StandardCharsets.UTF_8.name(), site.getDefaultCharset()); + } + + @Test + public void addCookieTest(){ + Site site=Site.me().setDefaultCharset(StandardCharsets.UTF_8.name()); + site.addCookie("cookieDefault","cookie-webmagicDefault"); + String firstDomain="example.com"; + String secondDomain="exampleCopy.com"; + site.addCookie(firstDomain, "cookie", "cookie-webmagic"); + site.addCookie(firstDomain, "cookieCopy", "cookie-webmagicCopy"); + site.addCookie(secondDomain, "cookie", "cookie-webmagic"); + Map> allCookies = site.getAllCookies(); + List domains=new ArrayList<>(); + for(String key : allCookies.keySet()){ + domains.add(key); + } + assertEquals("cookie-webmagic", 
allCookies.get(firstDomain).get("cookie")); + assertEquals("cookie-webmagicCopy", allCookies.get(firstDomain).get("cookieCopy")); + assertEquals("cookie-webmagic", allCookies.get(secondDomain).get("cookie")); + assertEquals(2, domains.size()); + } +} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java index ece060003..1ff7b4dd7 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -1,9 +1,10 @@ package us.codecraft.webmagic.downloader; -import com.github.dreamhead.moco.HttpServer; -import com.github.dreamhead.moco.Runnable; -import com.github.dreamhead.moco.Runner; -import org.apache.commons.collections.map.HashedMap; + +import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.util.Map; +import org.apache.commons.collections4.map.HashedMap; import org.apache.commons.io.IOUtils; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpUriRequest; @@ -11,6 +12,9 @@ import org.apache.http.impl.client.HttpClients; import org.apache.http.util.EntityUtils; import org.junit.Test; +import com.github.dreamhead.moco.HttpServer; +import com.github.dreamhead.moco.Runnable; +import com.github.dreamhead.moco.Runner; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; @@ -21,14 +25,22 @@ import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.utils.CharsetUtils; import us.codecraft.webmagic.utils.HttpConstant; - -import java.io.IOException; -import java.io.UnsupportedEncodingException; -import java.util.Map; - -import static com.github.dreamhead.moco.Moco.*; +import static com.github.dreamhead.moco.Moco.and; +import static 
com.github.dreamhead.moco.Moco.by; +import static com.github.dreamhead.moco.Moco.cookie; +import static com.github.dreamhead.moco.Moco.eq; +import static com.github.dreamhead.moco.Moco.form; +import static com.github.dreamhead.moco.Moco.header; +import static com.github.dreamhead.moco.Moco.httpServer; +import static com.github.dreamhead.moco.Moco.method; +import static com.github.dreamhead.moco.Moco.not; +import static com.github.dreamhead.moco.Moco.query; +import static com.github.dreamhead.moco.Moco.text; +import static com.github.dreamhead.moco.Moco.uri; +import static com.github.dreamhead.moco.Moco.with; import static org.assertj.core.api.Assertions.assertThat; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertThrows; import static org.junit.Assert.assertTrue; /** @@ -322,5 +334,13 @@ public void run() throws Exception { }); } + @Test + public void test_no_task_download(){ + Request request = new Request(); + request.setUrl("http://127.0.0.1:13423/"); + HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); + assertThrows(NullPointerException.class, () -> httpClientDownloader.download(request,null)); + } + } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java index 3aa742c10..58dd3a6fa 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java @@ -1,13 +1,15 @@ package us.codecraft.webmagic.downloader; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.Charset; import org.apache.commons.io.IOUtils; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.selector.PlainText; -import java.io.IOException; -import java.io.InputStream; /** 
* @author code4crafter@gmail.com @@ -19,7 +21,7 @@ public Page download(Request request, Task task) { Page page = new Page(); InputStream resourceAsStream = this.getClass().getResourceAsStream("/html/mock-github.html"); try { - page.setRawText(IOUtils.toString(resourceAsStream)); + page.setRawText(IOUtils.toString(resourceAsStream, Charset.defaultCharset())); } catch (IOException e) { e.printStackTrace(); } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/processor/PageProcessorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/processor/PageProcessorTest.java new file mode 100644 index 000000000..ebb1225cc --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/processor/PageProcessorTest.java @@ -0,0 +1,40 @@ +package us.codecraft.webmagic.processor; + +import static org.junit.Assert.assertEquals; + +import org.junit.Test; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; + +public class PageProcessorTest { + + @Test + public void testGetSite() { + Site actualSite = new PageProcessor() { + + @Override + public void process(Page page) { + } + + }.getSite(); + + assertEquals(Site.me(), actualSite); + + actualSite = new PageProcessor() { + + @Override + public void process(Page page) { + } + + @Override + public Site getSite() { + return Site.me().setTimeOut(123); + }; + + }.getSite(); + + assertEquals(Site.me().setTimeOut(123), actualSite); + } + +} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java index 8e4c82026..61fc6ab8b 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java @@ -8,19 +8,19 @@ import java.util.List; import org.apache.http.HttpHost; -import org.junit.BeforeClass; -import org.junit.Test; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; /** * @author 
yxssfxwzy@sina.com May 30, 2014 - * + * */ -public class ProxyTest { +class ProxyTest { private static List httpProxyList = new ArrayList(); - @BeforeClass - public static void before() { + @BeforeAll + static void before() { // String[] source = { "0.0.0.1:0", "0.0.0.2:0", "0.0.0.3:0", // "0.0.0.4:0" }; String[] source = { "::0.0.0.1:0", "::0.0.0.2:0", "::0.0.0.3:0", "::0.0.0.4:0" }; @@ -48,7 +48,7 @@ public void run() { } @Test - public void testCreate() { + void testCreate() { Proxy proxy = Proxy.create(URI.create("//127.0.0.1:8080")); assertNull(proxy.getScheme()); assertNull(proxy.getUsername()); @@ -86,7 +86,15 @@ public void testCreate() { } @Test - public void testToString() { + void testEqualsHashCode() { + var proxy0 = new Proxy("::1", 1080); + var proxy1 = new Proxy("::1", 1080); + assertEquals(proxy0, proxy1); + assertEquals(proxy0.hashCode(), proxy1.hashCode()); + } + + @Test + void testToString() { assertEquals("//127.0.0.1:8080", new Proxy("127.0.0.1", 8080).toString()); assertEquals("http://127.0.0.1:8080", new Proxy("127.0.0.1", 8080, "http").toString()); assertEquals("//username:password@127.0.0.1:8080", new Proxy("127.0.0.1", 8080, "username", "password").toString()); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/SimpleProxyProviderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/SimpleProxyProviderTest.java index 6495b16bf..e9325a7a7 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/SimpleProxyProviderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/SimpleProxyProviderTest.java @@ -1,6 +1,9 @@ package us.codecraft.webmagic.proxy; import org.junit.Test; +import org.mockito.Mockito; + +import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; @@ -20,11 +23,12 @@ public void test_get_proxy() throws Exception { Proxy originProxy1 = new Proxy("127.0.0.1", 1087); Proxy originProxy2 = new Proxy("127.0.0.1", 1088); 
SimpleProxyProvider proxyProvider = SimpleProxyProvider.from(originProxy1, originProxy2); - Proxy proxy = proxyProvider.getProxy(TASK); + Request request = Mockito.mock(Request.class); + Proxy proxy = proxyProvider.getProxy(request, TASK); assertThat(proxy).isEqualTo(originProxy1); - proxy = proxyProvider.getProxy(TASK); + proxy = proxyProvider.getProxy(request, TASK); assertThat(proxy).isEqualTo(originProxy2); - proxy = proxyProvider.getProxy(TASK); + proxy = proxyProvider.getProxy(request, TASK); assertThat(proxy).isEqualTo(originProxy1); } } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/AndSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/AndSelectorTest.java new file mode 100644 index 000000000..59885ebd1 --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/AndSelectorTest.java @@ -0,0 +1,59 @@ +package us.codecraft.webmagic.selector; + +import static org.junit.Assert.assertEquals; + +import java.util.ArrayList; +import java.util.List; + +import org.junit.Test; + +public class AndSelectorTest { + + @Test + public void testSelectList() { + String htmlContent = "\n" + + "\n" + + "\n" + + " \n" + + " \n" + + " HTML with XPath\n" + + "\n" + + "\n" + + "
\n" + + "
Item 1
\n" + + "
Item 2
\n" + + "
\n" + + "\n" + + ""; + List selectors = new ArrayList(); + selectors.add(new CssSelector("div")); + selectors.add(new XpathSelector("//div[@class='item1']")); + AndSelector andSelector = new AndSelector(selectors); + List result = andSelector.selectList(htmlContent); + assertEquals("
\n Item 1\n
", result.get(0)); + } + + @Test + public void testSelectList_NoResults() { + String htmlContent = "\n" + + "\n" + + "\n" + + " \n" + + " \n" + + " HTML with XPath\n" + + "\n" + + "\n" + + "
\n" + + "
Item 1
\n" + + "
Item 2
\n" + + "
\n" + + "\n" + + ""; + List selectors = new ArrayList(); + selectors.add(new CssSelector("div")); + selectors.add(new XpathSelector("//div[@class='item']")); + AndSelector andSelector = new AndSelector(selectors); + List result = andSelector.selectList(htmlContent); + assertEquals(0, result.size()); + } +} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/CssSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/CssSelectorTest.java new file mode 100644 index 000000000..8b1ace903 --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/CssSelectorTest.java @@ -0,0 +1,39 @@ +package us.codecraft.webmagic.selector; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mockito.Mock; +import org.mockito.Mockito; +import org.mockito.runners.MockitoJUnitRunner; + +import java.util.List; +import static org.junit.Assert.*; + +public class CssSelectorTest { + + @Test + public void testSelectElement() { + CssSelector cssSelector = new CssSelector("div"); + String htmlContent = "Dummy Page
Hello World!
"; + Document doc = Jsoup.parse(htmlContent); + Element dummyElement = doc.getElementById("dummyDiv"); + Element resultElement = cssSelector.selectElement(dummyElement); + assertNotNull(resultElement); + } + + @Test + public void testSelectList() { + CssSelector cssSelector = new CssSelector("div"); + String htmlContent = "Dummy Page
Hello World!
"; + Document doc = Jsoup.parse(htmlContent); + Element dummyElement = doc.getElementById("dummyDiv"); + List result = cssSelector.selectList(dummyElement); + assertEquals(1, result.size()); + assertEquals("[
\n Hello World!\n
]", result.toString()); + } + +} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/OrSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/OrSelectorTest.java new file mode 100644 index 000000000..24d87647c --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/OrSelectorTest.java @@ -0,0 +1,44 @@ +package us.codecraft.webmagic.selector; + +import static org.junit.Assert.assertEquals; + +import java.util.ArrayList; +import java.util.List; + +import org.junit.Test; + +public class OrSelectorTest { + @Test + public void testSelectList() { + String htmlContent = "\n" + + "\n" + + "\n" + + " \n" + + " \n" + + " HTML with XPath\n" + + "\n" + + "\n" + + "
\n" + + "
Item 1
\n" + + "
Item 2
\n" + + "
\n" + + "\n" + + ""; + String expectedResult = "[\n" + + " \n" + + " \n" + + " HTML with XPath\n" + + ",
\n" + + " Item 1\n" + + "
,
\n" + + " Item 2\n" + + "
]"; + List selectors = new ArrayList(); + selectors.add(new CssSelector("head")); + selectors.add(new XpathSelector("//div[@class='item1']")); + selectors.add(new XpathSelector("//div[@class='item2']")); + OrSelector orSelector = new OrSelector(selectors); + List result = orSelector.selectList(htmlContent); + assertEquals(expectedResult, result.toString()); + } +} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/CharsetUtilsTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/CharsetUtilsTest.java new file mode 100644 index 000000000..987a6f77a --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/CharsetUtilsTest.java @@ -0,0 +1,16 @@ +package us.codecraft.webmagic.utils; + +import static org.junit.jupiter.api.Assertions.assertNull; + +import java.io.IOException; + +import org.junit.jupiter.api.Test; + +class CharsetUtilsTest { + + @Test + void testDetectCharset() throws IOException { + assertNull(CharsetUtils.detectCharset(null, new byte[0])); + } + +} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java index 6afdeefe4..38c8295bb 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java @@ -1,5 +1,7 @@ package us.codecraft.webmagic.utils; +import static org.junit.Assert.assertNull; + import org.junit.Assert; import org.junit.Test; @@ -43,5 +45,9 @@ public void testGetDomain(){ Assert.assertEquals("www.dianping.com",UrlUtils.getDomain(url)); } + @Test + public void testGetCharset() { + assertNull(UrlUtils.getCharset(null)); + } } diff --git a/webmagic-core/src/test/resources/log4j.xml b/webmagic-core/src/test/resources/log4j.xml deleted file mode 100644 index c2b5a2f53..000000000 --- a/webmagic-core/src/test/resources/log4j.xml +++ /dev/null @@ -1,21 +0,0 @@ - - - - - - - - - - - - - - - - - 
- - - - diff --git a/webmagic-core/src/test/resources/log4j2-test.xml b/webmagic-core/src/test/resources/log4j2-test.xml new file mode 100644 index 000000000..86aee5f59 --- /dev/null +++ b/webmagic-core/src/test/resources/log4j2-test.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml new file mode 100644 index 000000000..2b4a53460 --- /dev/null +++ b/webmagic-coverage/pom.xml @@ -0,0 +1,74 @@ + + + 4.0.0 + + + us.codecraft + webmagic + 1.0.3 + + + webmagic-coverage + pom + webmagic-coverage + Compute aggregated test code coverage + + + true + + + + + ${project.groupId} + webmagic-core + ${project.version} + + + ${project.groupId} + webmagic-extension + ${project.version} + + + ${project.groupId} + webmagic-scripts + ${project.version} + + + ${project.groupId} + webmagic-selenium + ${project.version} + + + ${project.groupId} + webmagic-saxon + ${project.version} + + + ${project.groupId} + webmagic-samples + ${project.version} + + + + + + + org.jacoco + jacoco-maven-plugin + + + + report-aggregate + + + + + + + + diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index bf7ff05d6..93faa4aaf 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -1,19 +1,35 @@ - + us.codecraft - webmagic-parent - 0.7.3 + webmagic + 1.0.3 4.0.0 webmagic-extension + + org.projectlombok + lombok + 1.18.32 + provided + redis.clients jedis + + org.assertj + assertj-core + test + com.google.guava guava @@ -24,10 +40,6 @@ webmagic-core ${project.version} - - junit - junit - diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java index 6055bdb0f..01f1af9a3 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java +++ 
b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java @@ -6,6 +6,7 @@ import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.selector.PlainText; +import us.codecraft.webmagic.utils.HttpConstant; import java.io.*; @@ -16,73 +17,70 @@ * @version 0.5.3 */ public class PhantomJSDownloader extends AbstractDownloader { - - private static Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class); + private static final Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class); private static String crawlJsPath; private static String phantomJsCommand = "phantomjs"; // default - private int retryNum; - private int threadNum; - public PhantomJSDownloader() { this.initPhantomjsCrawlPath(); } - + /** * 添加新的构造函数,支持phantomjs自定义命令 - * - * example: - * phantomjs.exe 支持windows环境 - * phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误 - * /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException - * + *

+ * example: + * phantomjs.exe 支持windows环境 + * phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误 + * /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException + * * @param phantomJsCommand phantomJsCommand */ public PhantomJSDownloader(String phantomJsCommand) { this.initPhantomjsCrawlPath(); PhantomJSDownloader.phantomJsCommand = phantomJsCommand; } - + /** * 新增构造函数,支持crawl.js路径自定义,因为当其他项目依赖此jar包时,runtime.exec()执行phantomjs命令时无使用法jar包中的crawl.js *

      * crawl.js start --
-     * 
+     *
      *   var system = require('system');
      *   var url = system.args[1];
-     *   
+     *
      *   var page = require('webpage').create();
      *   page.settings.loadImages = false;
      *   page.settings.resourceTimeout = 5000;
-     *   
+     *
      *   page.open(url, function (status) {
      *       if (status != 'success') {
      *           console.log("HTTP request failed!");
      *       } else {
      *           console.log(page.content);
      *       }
-     *   
+     *
      *       page.close();
      *       phantom.exit();
      *   });
-     *   
+     *
      * -- crawl.js end
      * 
* 具体项目时可以将以上js代码复制下来使用 - * + *

* example: - * new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js"); - * + * new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js"); + * * @param phantomJsCommand phantomJsCommand - * @param crawlJsPath crawlJsPath + * @param crawlJsPath crawlJsPath */ public PhantomJSDownloader(String phantomJsCommand, String crawlJsPath) { - PhantomJSDownloader.phantomJsCommand = phantomJsCommand; - PhantomJSDownloader.crawlJsPath = crawlJsPath; + PhantomJSDownloader.phantomJsCommand = phantomJsCommand; + PhantomJSDownloader.crawlJsPath = crawlJsPath; } - + private void initPhantomjsCrawlPath() { - PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath() + System.getProperty("file.separator") + "crawl.js "; + PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath() + + System.getProperty("file.separator") + "crawl.js "; } @Override @@ -90,61 +88,41 @@ public Page download(Request request, Task task) { if (logger.isInfoEnabled()) { logger.info("downloading page: " + request.getUrl()); } - String content = getPage(request); - if (content.contains("HTTP request failed")) { - for (int i = 1; i <= getRetryNum(); i++) { - content = getPage(request); - if (!content.contains("HTTP request failed")) { - break; - } - } - if (content.contains("HTTP request failed")) { - //when failed - Page page = new Page(); + + Page page = Page.fail(request); + try { + String content = getPage(request); + if (!content.contains("HTTP request failed")) { + page.setDownloadSuccess(true); + page.setRawText(content); + page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); - return page; + page.setStatusCode(HttpConstant.StatusCode.CODE_200); } + onSuccess(page, task); + } catch (Exception e) { + onError(page, task, e); + logger.warn("download page {} error", request.getUrl(), e); } - - Page page = new Page(); - page.setRawText(content); - page.setUrl(new 
PlainText(request.getUrl())); - page.setRequest(request); - page.setStatusCode(200); return page; } @Override public void setThread(int threadNum) { - this.threadNum = threadNum; + // ignore } - protected String getPage(Request request) { - try { - String url = request.getUrl(); - Runtime runtime = Runtime.getRuntime(); - Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url); - InputStream is = process.getInputStream(); - BufferedReader br = new BufferedReader(new InputStreamReader(is)); - StringBuffer stringBuffer = new StringBuffer(); - String line; - while ((line = br.readLine()) != null) { - stringBuffer.append(line).append("\n"); - } - return stringBuffer.toString(); - } catch (IOException e) { - e.printStackTrace(); + protected String getPage(Request request) throws Exception { + String url = request.getUrl(); + Runtime runtime = Runtime.getRuntime(); + Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url); + InputStream is = process.getInputStream(); + BufferedReader br = new BufferedReader(new InputStreamReader(is)); + StringBuilder builder = new StringBuilder(); + String line; + while ((line = br.readLine()) != null) { + builder.append(line).append("\n"); } - - return null; - } - - public int getRetryNum() { - return retryNum; - } - - public PhantomJSDownloader setRetryNum(int retryNum) { - this.retryNum = retryNum; - return this; + return builder.toString(); } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java index f1d2f84d4..673447586 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java @@ -1,5 +1,9 @@ package us.codecraft.webmagic.model; +import lombok.Getter; +import lombok.Setter; + +import us.codecraft.webmagic.model.sources.Source; import 
us.codecraft.webmagic.selector.Selector; /** @@ -7,18 +11,18 @@ * @author code4crafter@gmail.com
* @since 0.2.0 */ -class Extractor { +public class Extractor { + @Getter @Setter protected Selector selector; + @Getter protected final Source source; protected final boolean notNull; protected final boolean multi; - - static enum Source {Html, Url, RawHtml, RawText} - + public Extractor(Selector selector, Source source, boolean notNull, boolean multi) { this.selector = selector; this.source = source; @@ -26,23 +30,11 @@ public Extractor(Selector selector, Source source, boolean notNull, boolean mult this.multi = multi; } - Selector getSelector() { - return selector; - } - - Source getSource() { - return source; - } - - boolean isNotNull() { + public boolean isNotNull() { return notNull; } - boolean isMulti() { + public boolean isMulti() { return multi; } - - void setSelector(Selector selector) { - this.selector = selector; - } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java index a2cba1332..d4cb5937f 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java @@ -1,58 +1,33 @@ package us.codecraft.webmagic.model; import us.codecraft.webmagic.model.formatter.ObjectFormatter; +import us.codecraft.webmagic.model.sources.Source; import us.codecraft.webmagic.selector.Selector; import java.lang.reflect.Field; import java.lang.reflect.Method; +import lombok.Getter; +import lombok.Setter; + /** * Wrapper of field and extractor. * @author code4crafter@gmail.com
* @since 0.2.0 */ -class FieldExtractor extends Extractor { +public class FieldExtractor extends Extractor { + @Getter private final Field field; + @Getter @Setter private Method setterMethod; + @Getter @Setter private ObjectFormatter objectFormatter; public FieldExtractor(Field field, Selector selector, Source source, boolean notNull, boolean multi) { super(selector, source, notNull, multi); this.field = field; } - - Field getField() { - return field; - } - - Selector getSelector() { - return selector; - } - - Source getSource() { - return source; - } - - void setSetterMethod(Method setterMethod) { - this.setterMethod = setterMethod; - } - - Method getSetterMethod() { - return setterMethod; - } - - boolean isNotNull() { - return notNull; - } - - ObjectFormatter getObjectFormatter() { - return objectFormatter; - } - - void setObjectFormatter(ObjectFormatter objectFormatter) { - this.objectFormatter = objectFormatter; - } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index 1e25a46c0..751aafe76 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -3,17 +3,21 @@ import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + +import lombok.Getter; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.model.annotation.*; -import us.codecraft.webmagic.model.formatter.ObjectFormatter; +import us.codecraft.webmagic.model.fields.PageField; import us.codecraft.webmagic.model.formatter.ObjectFormatterBuilder; +import us.codecraft.webmagic.model.sources.Source; +import us.codecraft.webmagic.model.sources.SourceTextExtractor; +import us.codecraft.webmagic.model.sources.Source.*; import us.codecraft.webmagic.selector.*; import 
us.codecraft.webmagic.utils.ClassUtils; import us.codecraft.webmagic.utils.ExtractorUtils; import java.lang.annotation.Annotation; import java.lang.reflect.Field; -import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; import java.util.ArrayList; import java.util.List; @@ -29,14 +33,19 @@ */ class PageModelExtractor { + @Getter private List targetUrlPatterns = new ArrayList(); + @Getter private Selector targetUrlRegionSelector; + @Getter private List helpUrlPatterns = new ArrayList(); + @Getter private Selector helpUrlRegionSelector; + @Getter private Class clazz; private List fieldExtractors; @@ -86,7 +95,7 @@ private FieldExtractor getAnnotationExtractByUrl(Class clazz, Field field) { regexPattern = ".*"; } fieldExtractor = new FieldExtractor(field, - new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(), + new RegexSelector(regexPattern), new Url(), extractByUrl.notNull(), extractByUrl.multi() || List.class.isAssignableFrom(field.getType())); Method setterMethod = getSetterMethod(clazz, field); if (setterMethod != null) { @@ -112,7 +121,7 @@ private FieldExtractor getAnnotationExtractCombo(Class clazz, Field field) { default: selector = new AndSelector(ExtractorUtils.getSelectors(extractBies)); } - fieldExtractor = new FieldExtractor(field, selector, comboExtract.source() == ComboExtract.Source.RawHtml ? FieldExtractor.Source.RawHtml : FieldExtractor.Source.Html, + fieldExtractor = new FieldExtractor(field, selector, comboExtract.source() == ComboExtract.Source.RawHtml ? 
new RawHtml() : new SelectedHtml(), comboExtract.notNull(), comboExtract.multi() || List.class.isAssignableFrom(field.getType())); Method setterMethod = getSetterMethod(clazz, field); if (setterMethod != null) { @@ -127,26 +136,23 @@ private FieldExtractor getAnnotationExtractBy(Class clazz, Field field) { ExtractBy extractBy = field.getAnnotation(ExtractBy.class); if (extractBy != null) { Selector selector = ExtractorUtils.getSelector(extractBy); - ExtractBy.Source source0 = extractBy.source(); - if (extractBy.type()== ExtractBy.Type.JsonPath){ - source0 = RawText; - } - FieldExtractor.Source source = null; - switch (source0){ + ExtractBy.Source extractSource = extractBy.source(); + if (extractBy.type()== ExtractBy.Type.JsonPath) + extractSource = RawText; + Source source = null; + switch (extractSource) { case RawText: - source = FieldExtractor.Source.RawText; + source = new RawText(); break; case RawHtml: - source = FieldExtractor.Source.RawHtml; + source = new RawHtml(); break; case SelectedHtml: - source =FieldExtractor.Source.Html; + source = new SelectedHtml(); break; default: - source =FieldExtractor.Source.Html; - + source = new SelectedHtml(); } - fieldExtractor = new FieldExtractor(field, selector, source, extractBy.notNull(), List.class.isAssignableFrom(field.getType())); fieldExtractor.setSetterMethod(getSetterMethod(clazz, field)); @@ -193,7 +199,7 @@ private void initClassExtractors() { annotation = clazz.getAnnotation(ExtractBy.class); if (annotation != null) { ExtractBy extractBy = (ExtractBy) annotation; - objectExtractor = new Extractor(new XpathSelector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi()); + objectExtractor = new Extractor(new XpathSelector(extractBy.value()), new SelectedHtml(), extractBy.notNull(), extractBy.multi()); } } @@ -233,135 +239,15 @@ private Object processSingle(Page page, String html, boolean isRaw) { try { o = clazz.newInstance(); for (FieldExtractor fieldExtractor : fieldExtractors) 
{ - if (fieldExtractor.isMulti()) { - List value; - switch (fieldExtractor.getSource()) { - case RawHtml: - value = page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); - break; - case Html: - if (isRaw) { - value = page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); - } else { - value = fieldExtractor.getSelector().selectList(html); - } - break; - case Url: - value = fieldExtractor.getSelector().selectList(page.getUrl().toString()); - break; - case RawText: - value = fieldExtractor.getSelector().selectList(page.getRawText()); - break; - default: - value = fieldExtractor.getSelector().selectList(html); - } - if ((value == null || value.size() == 0) && fieldExtractor.isNotNull()) { - return null; - } - if (fieldExtractor.getObjectFormatter() != null) { - List converted = convert(value, fieldExtractor.getObjectFormatter()); - setField(o, fieldExtractor, converted); - } else { - setField(o, fieldExtractor, value); - } - } else { - String value; - switch (fieldExtractor.getSource()) { - case RawHtml: - value = page.getHtml().selectDocument(fieldExtractor.getSelector()); - break; - case Html: - if (isRaw) { - value = page.getHtml().selectDocument(fieldExtractor.getSelector()); - } else { - value = fieldExtractor.getSelector().select(html); - } - break; - case Url: - value = fieldExtractor.getSelector().select(page.getUrl().toString()); - break; - case RawText: - value = fieldExtractor.getSelector().select(page.getRawText()); - break; - default: - value = fieldExtractor.getSelector().select(html); - } - if (value == null && fieldExtractor.isNotNull()) { - return null; - } - if (fieldExtractor.getObjectFormatter() != null) { - Object converted = convert(value, fieldExtractor.getObjectFormatter()); - if (converted == null && fieldExtractor.isNotNull()) { - return null; - } - setField(o, fieldExtractor, converted); - } else { - setField(o, fieldExtractor, value); - } - } + PageField field = SourceTextExtractor.getText(page, html, isRaw, 
fieldExtractor); + if (!field.operation(o, fieldExtractor, logger)) + return null; } - if (AfterExtractor.class.isAssignableFrom(clazz)) { + if (AfterExtractor.class.isAssignableFrom(clazz)) ((AfterExtractor) o).afterProcess(page); - } - } catch (InstantiationException e) { - logger.error("extract fail", e); - } catch (IllegalAccessException e) { - logger.error("extract fail", e); - } catch (InvocationTargetException e) { + } catch (Exception e) { logger.error("extract fail", e); } return o; } - - private Object convert(String value, ObjectFormatter objectFormatter) { - try { - Object format = objectFormatter.format(value); - logger.debug("String {} is converted to {}", value, format); - return format; - } catch (Exception e) { - logger.error("convert " + value + " to " + objectFormatter.clazz() + " error!", e); - } - return null; - } - - private List convert(List values, ObjectFormatter objectFormatter) { - List objects = new ArrayList(); - for (String value : values) { - Object converted = convert(value, objectFormatter); - if (converted != null) { - objects.add(converted); - } - } - return objects; - } - - private void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException { - if (value == null) { - return; - } - if (fieldExtractor.getSetterMethod() != null) { - fieldExtractor.getSetterMethod().invoke(o, value); - } - fieldExtractor.getField().set(o, value); - } - - Class getClazz() { - return clazz; - } - - List getTargetUrlPatterns() { - return targetUrlPatterns; - } - - List getHelpUrlPatterns() { - return helpUrlPatterns; - } - - Selector getTargetUrlRegionSelector() { - return targetUrlRegionSelector; - } - - Selector getHelpUrlRegionSelector() { - return helpUrlRegionSelector; - } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/MultipleField.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/MultipleField.java new file mode 100644 
index 000000000..4a4bf38a8 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/MultipleField.java @@ -0,0 +1,42 @@ +package us.codecraft.webmagic.model.fields; + +import java.lang.reflect.InvocationTargetException; +import java.util.ArrayList; +import java.util.List; + +import org.slf4j.Logger; + +import lombok.Getter; +import us.codecraft.webmagic.model.FieldExtractor; +import us.codecraft.webmagic.model.formatter.ObjectFormatter; + +public class MultipleField extends PageField { + @Getter + private List fieldNames; + + public MultipleField(List fieldNames) { + this.fieldNames = fieldNames; + } + + public boolean operation(Object o, FieldExtractor fieldExtractor, Logger logger) throws IllegalAccessException, InvocationTargetException { + if ((this.fieldNames == null || this.fieldNames.size() == 0) && fieldExtractor.isNotNull()) + return false; + if (fieldExtractor.getObjectFormatter() != null) { + List converted = this.convert(this.fieldNames, fieldExtractor.getObjectFormatter(), logger); + setField(o, fieldExtractor, converted); + } + else + setField(o, fieldExtractor, this.fieldNames); + return true; + } + + private List convert(List values, ObjectFormatter objectFormatter, Logger logger) { + List objects = new ArrayList<>(); + for (String value : values) { + Object converted = this.convert(value, objectFormatter, logger); + if (converted != null) + objects.add(converted); + } + return objects; + } +} \ No newline at end of file diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/PageField.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/PageField.java new file mode 100644 index 000000000..ad4428335 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/PageField.java @@ -0,0 +1,31 @@ +package us.codecraft.webmagic.model.fields; + +import java.lang.reflect.InvocationTargetException; + +import org.slf4j.Logger; + +import 
us.codecraft.webmagic.model.FieldExtractor; +import us.codecraft.webmagic.model.formatter.ObjectFormatter; + +public abstract class PageField { + public abstract boolean operation(Object o, FieldExtractor fieldExtractor, Logger logger) throws IllegalAccessException, InvocationTargetException; + + protected Object convert(String value, ObjectFormatter objectFormatter, Logger logger) { + try { + Object format = objectFormatter.format(value); + logger.debug("String {} is converted to {}", value, format); + return format; + } catch (Exception e) { + logger.error("convert " + value + " to " + objectFormatter.clazz() + " error!", e); + } + return null; + } + + protected void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException { + if (value != null) { + if (fieldExtractor.getSetterMethod() != null) + fieldExtractor.getSetterMethod().invoke(o, value); + fieldExtractor.getField().set(o, value); + } + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/SingleField.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/SingleField.java new file mode 100644 index 000000000..136a1c56e --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/SingleField.java @@ -0,0 +1,28 @@ +package us.codecraft.webmagic.model.fields; + +import java.lang.reflect.InvocationTargetException; + +import org.slf4j.Logger; + +import lombok.Getter; +import us.codecraft.webmagic.model.FieldExtractor; + +public class SingleField extends PageField { + @Getter + private String fieldName; + + public SingleField(String fieldName) { + this.fieldName = fieldName; + } + + public boolean operation(Object o, FieldExtractor fieldExtractor, Logger logger) throws IllegalAccessException, InvocationTargetException { + if (fieldExtractor.getObjectFormatter() != null) { + Object converted = this.convert(this.fieldName, fieldExtractor.getObjectFormatter(), logger); + if 
(converted == null && fieldExtractor.isNotNull()) + return false; + setField(o, fieldExtractor, converted); + } else + setField(o, fieldExtractor, this.fieldName); + return true; + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicClassDetector.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicClassDetector.java new file mode 100644 index 000000000..f03b8864a --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicClassDetector.java @@ -0,0 +1,85 @@ +package us.codecraft.webmagic.model.formatter; + +public interface BasicClassDetector { + Class detectBasicClass(Class type); +} + +class IntegerClassDetector implements BasicClassDetector { + @Override + public Class detectBasicClass(Class type) { + if (type.equals(Integer.TYPE) || type.equals(Integer.class)) { + return Integer.class; + } + return null; + } +} + +class LongClassDetector implements BasicClassDetector { + @Override + public Class detectBasicClass(Class type) { + if (type.equals(Long.TYPE) || type.equals(Long.class)) { + return Long.class; + } + return null; + } +} + +class DoubleClassDetector implements BasicClassDetector { + @Override + public Class detectBasicClass(Class type) { + if (type.equals(Double.TYPE) || type.equals(Double.class)) { + return Double.class; + } + return null; + } +} + +class FloatClassDetector implements BasicClassDetector { + @Override + public Class detectBasicClass(Class type) { + if (type.equals(Float.TYPE) || type.equals(Float.class)) { + return Float.class; + } + return null; + } +} + +class ShortClassDetector implements BasicClassDetector { + @Override + public Class detectBasicClass(Class type) { + if (type.equals(Short.TYPE) || type.equals(Short.class)) { + return Short.class; + } + return null; + } +} + +class CharacterClassDetector implements BasicClassDetector { + @Override + public Class detectBasicClass(Class type) { + if (type.equals(Character.TYPE) 
|| type.equals(Character.class)) { + return Character.class; + } + return null; + } +} + +class ByteClassDetector implements BasicClassDetector { + @Override + public Class detectBasicClass(Class type) { + if (type.equals(Byte.TYPE) || type.equals(Byte.class)) { + return Byte.class; + } + return null; + } +} + +class BooleanClassDetector implements BasicClassDetector { + @Override + public Class detectBasicClass(Class type) { + if (type.equals(Boolean.TYPE) || type.equals(Boolean.class)) { + return Boolean.class; + } + return null; + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java index f9d76a845..2d4d85b0a 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java @@ -24,28 +24,24 @@ public T format(String raw) throws Exception { } protected abstract T formatTrimmed(String raw) throws Exception; - public static final List> basicTypeFormatters = Arrays.>asList(IntegerFormatter.class, LongFormatter.class, DoubleFormatter.class, FloatFormatter.class, ShortFormatter.class, CharactorFormatter.class, ByteFormatter.class, BooleanFormatter.class); + public static final List basicClassDetector= Arrays.asList(new IntegerClassDetector(), + new LongClassDetector(), + new FloatClassDetector(), + new DoubleClassDetector(), + new ShortClassDetector(), + new ByteClassDetector(), + new BooleanClassDetector(), + new CharacterClassDetector()); public static Class detectBasicClass(Class type) { - if (type.equals(Integer.TYPE) || type.equals(Integer.class)) { - return Integer.class; - } else if (type.equals(Long.TYPE) || type.equals(Long.class)) { - return Long.class; - } else if (type.equals(Double.TYPE) || type.equals(Double.class)) { - return Double.class; - } else if 
(type.equals(Float.TYPE) || type.equals(Float.class)) { - return Float.class; - } else if (type.equals(Short.TYPE) || type.equals(Short.class)) { - return Short.class; - } else if (type.equals(Character.TYPE) || type.equals(Character.class)) { - return Character.class; - } else if (type.equals(Byte.TYPE) || type.equals(Byte.class)) { - return Byte.class; - } else if (type.equals(Boolean.TYPE) || type.equals(Boolean.class)) { - return Boolean.class; + for (BasicClassDetector detector : basicClassDetector) { + Class detectedClass = detector.detectBasicClass(type); + if (detectedClass != null) { + return detectedClass; + } } return type; } @@ -146,5 +142,4 @@ public Class clazz() { } } - } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/Source.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/Source.java new file mode 100644 index 000000000..146827220 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/Source.java @@ -0,0 +1,68 @@ +package us.codecraft.webmagic.model.sources; + +import java.util.List; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.model.FieldExtractor; + +public interface Source { + public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor); + public List getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor); + + public class RawHtml implements Source { + public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + return page.getHtml().selectDocument(fieldExtractor.getSelector()); + } + + public List getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + return page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); + } + } + + public class SelectedHtml implements Source { + public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + if (isRaw) + return 
page.getHtml().selectDocument(fieldExtractor.getSelector()); + else + return fieldExtractor.getSelector().select(html); + } + + public List getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + if (isRaw) + return page.getHtml().selectDocumentForList(fieldExtractor.getSelector()); + else + return fieldExtractor.getSelector().selectList(html); + } + } + + public class Url implements Source { + public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + return fieldExtractor.getSelector().select(page.getUrl().toString()); + } + + public List getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + return fieldExtractor.getSelector().selectList(page.getUrl().toString()); + } + } + + public class RawText implements Source { + public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + return fieldExtractor.getSelector().select(page.getRawText()); + } + + public List getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + return fieldExtractor.getSelector().selectList(page.getRawText()); + } + } + + public class DefaultSource implements Source { + public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + return fieldExtractor.getSelector().select(html); + } + + public List getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + return fieldExtractor.getSelector().selectList(html); + } + } +} + diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/SourceTextExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/SourceTextExtractor.java new file mode 100644 index 000000000..1e572695f --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/SourceTextExtractor.java @@ -0,0 +1,17 @@ +package us.codecraft.webmagic.model.sources; + +import 
us.codecraft.webmagic.Page; +import us.codecraft.webmagic.model.FieldExtractor; +import us.codecraft.webmagic.model.fields.MultipleField; +import us.codecraft.webmagic.model.fields.PageField; +import us.codecraft.webmagic.model.fields.SingleField; + +public class SourceTextExtractor { + public static PageField getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) { + Source source = fieldExtractor.getSource(); + if (fieldExtractor.isMulti()) + return new MultipleField(source.getTextList(page, html, isRaw, fieldExtractor)); + else + return new SingleField(source.getText(page, html, isRaw, fieldExtractor)); + } +} \ No newline at end of file diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java index cfb4a8200..50dbcaf1a 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java @@ -1,21 +1,25 @@ package us.codecraft.webmagic.monitor; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import us.codecraft.webmagic.Request; -import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.SpiderListener; -import us.codecraft.webmagic.utils.Experimental; -import us.codecraft.webmagic.utils.UrlUtils; - -import javax.management.*; import java.lang.management.ManagementFactory; import java.util.ArrayList; import java.util.Collections; import java.util.List; -import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; +import javax.management.InstanceAlreadyExistsException; +import javax.management.JMException; +import javax.management.MBeanRegistrationException; +import javax.management.MBeanServer; +import javax.management.MalformedObjectNameException; +import javax.management.NotCompliantMBeanException; +import javax.management.ObjectName; + +import 
us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.SpiderListener; +import us.codecraft.webmagic.utils.Experimental; +import us.codecraft.webmagic.utils.UrlUtils; + /** * @author code4crafer@gmail.com * @since 0.5.0 @@ -23,17 +27,13 @@ @Experimental public class SpiderMonitor { - private static SpiderMonitor INSTANCE = new SpiderMonitor(); - - private AtomicBoolean started = new AtomicBoolean(false); - - private Logger logger = LoggerFactory.getLogger(getClass()); + private static final SpiderMonitor INSTANCE = new SpiderMonitor(); private MBeanServer mbeanServer; private String jmxServerName; - private List spiderStatuses = new ArrayList(); + private List spiderStatuses = new ArrayList<>(); protected SpiderMonitor() { jmxServerName = "WebMagic"; @@ -51,7 +51,7 @@ public synchronized SpiderMonitor register(Spider... spiders) throws JMException for (Spider spider : spiders) { MonitorSpiderListener monitorSpiderListener = new MonitorSpiderListener(); if (spider.getSpiderListeners() == null) { - List spiderListeners = new ArrayList(); + List spiderListeners = new ArrayList<>(); spiderListeners.add(monitorSpiderListener); spider.setSpiderListeners(spiderListeners); } else { @@ -68,6 +68,10 @@ protected SpiderStatusMXBean getSpiderStatusMBean(Spider spider, MonitorSpiderLi return new SpiderStatus(spider, monitorSpiderListener); } + protected List getSpiderStatuses() { + return this.spiderStatuses; + } + public static SpiderMonitor instance() { return INSTANCE; } @@ -86,7 +90,7 @@ public void onSuccess(Request request) { } @Override - public void onError(Request request) { + public void onError(Request request, Exception e) { errorUrls.add(request.getUrl()); errorCount.incrementAndGet(); } @@ -105,7 +109,6 @@ public List getErrorUrls() { } protected void registerMBean(SpiderStatusMXBean spiderStatus) throws MalformedObjectNameException, InstanceAlreadyExistsException, MBeanRegistrationException, NotCompliantMBeanException { 
-// ObjectName objName = new ObjectName(jmxServerName + ":name=" + spiderStatus.getName()); ObjectName objName = new ObjectName(jmxServerName + ":name=" + UrlUtils.removePort(spiderStatus.getName())); mbeanServer.registerMBean(spiderStatus, objName); } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java index a87c040bd..69afe042a 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java @@ -84,8 +84,13 @@ public Date getStartTime() { @Override public int getPagePerSecond() { - int runSeconds = (int) (System.currentTimeMillis() - getStartTime().getTime()) / 1000; - return getSuccessPageCount() / runSeconds; + if (getStartTime() != null) { + int runSeconds = (int) (System.currentTimeMillis() - getStartTime().getTime()) / 1000; + if (runSeconds != 0) { + return getSuccessPageCount() / runSeconds; + } + } + return -1; } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java index fec3c1db9..0dabdd954 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java @@ -1,29 +1,13 @@ package us.codecraft.webmagic.scheduler; -import java.io.BufferedReader; -import java.io.Closeable; -import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileReader; -import java.io.FileWriter; -import java.io.IOException; -import java.io.PrintWriter; -import java.util.LinkedHashSet; -import java.util.Set; -import java.util.concurrent.BlockingQueue; -import java.util.concurrent.Executors; -import java.util.concurrent.LinkedBlockingQueue; 
-import java.util.concurrent.ScheduledExecutorService; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicBoolean; -import java.util.concurrent.atomic.AtomicInteger; - -import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.math.NumberUtils; - import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; -import us.codecraft.webmagic.scheduler.component.DuplicateRemover; + +import java.io.*; +import java.util.concurrent.*; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; /** @@ -32,7 +16,7 @@ * @author code4crafter@gmail.com
* @since 0.2.0 */ -public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler,Closeable { +public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler, Closeable { private String filePath = System.getProperty("java.io.tmpdir"); @@ -52,8 +36,6 @@ public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implement private BlockingQueue queue; - private Set urls; - private ScheduledExecutorService flushThreadPool; public FileCacheQueueScheduler(String filePath) { @@ -83,36 +65,13 @@ private void init(Task task) { } private void initDuplicateRemover() { - setDuplicateRemover( - new DuplicateRemover() { - @Override - public boolean isDuplicate(Request request, Task task) { - if (!inited.get()) { - init(task); - } - return !urls.add(request.getUrl()); - } - - @Override - public void resetDuplicateCheck(Task task) { - urls.clear(); - } - - @Override - public int getTotalRequestsCount(Task task) { - return urls.size(); - } - }); + BloomFilterDuplicateRemover bloomFilterDuplicateRemover = new BloomFilterDuplicateRemover(this.filePath.hashCode()); + setDuplicateRemover(bloomFilterDuplicateRemover); } private void initFlushThread() { - flushThreadPool = Executors.newScheduledThreadPool(1); - flushThreadPool.scheduleAtFixedRate(new Runnable() { - @Override - public void run() { - flush(); - } - }, 10, 10, TimeUnit.SECONDS); + flushThreadPool = Executors.newScheduledThreadPool(1); + flushThreadPool.scheduleAtFixedRate(this::flush, 10, 10, TimeUnit.SECONDS); } private void initWriter() { @@ -127,7 +86,6 @@ private void initWriter() { private void readFile() { try { queue = new LinkedBlockingQueue(); - urls = new LinkedHashSet(); readCursorFile(); readUrlFile(); // initDuplicateRemover(); @@ -140,46 +98,43 @@ private void readFile() { } private void readUrlFile() throws IOException { - String line; - BufferedReader fileUrlReader = null; - try { - fileUrlReader = new 
BufferedReader(new FileReader(getFileName(fileUrlAllName))); + try (BufferedReader fileUrlReader = new BufferedReader(new FileReader(getFileName(fileUrlAllName)))) { + String line; int lineReaded = 0; while ((line = fileUrlReader.readLine()) != null) { - urls.add(line.trim()); + Request request = deserializeRequest(line); + this.getDuplicateRemover().isDuplicate(request, null); lineReaded++; if (lineReaded > cursor.get()) { - queue.add(deserializeRequest(line)); + queue.add(request); } } - } finally { - if (fileUrlReader != null) { - IOUtils.closeQuietly(fileUrlReader); - } } } private void readCursorFile() throws IOException { - BufferedReader fileCursorReader = null; - try { - fileCursorReader = new BufferedReader(new FileReader(getFileName(fileCursor))); + String fileName = getFileName(fileCursor); + try (BufferedReader fileCursorReader = new BufferedReader(new FileReader(fileName))) { String line; + String lastLine = null; //read the last number while ((line = fileCursorReader.readLine()) != null) { - cursor = new AtomicInteger(NumberUtils.toInt(line)); + line = line.trim(); + if (!line.isEmpty()) { + lastLine = line; + } } - } finally { - if (fileCursorReader != null) { - IOUtils.closeQuietly(fileCursorReader); + if (lastLine != null) { + cursor.set(NumberUtils.toInt(line)); } } } - + public void close() throws IOException { - flushThreadPool.shutdown(); - fileUrlWriter.close(); - fileCursorWriter.close(); - } + flushThreadPool.shutdown(); + fileUrlWriter.close(); + fileCursorWriter.close(); + } private String getFileName(String filename) { return filePath + task.getUUID() + filename; diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java index 540574ad2..7abe5bfad 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java +++ 
b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java @@ -1,22 +1,23 @@ package us.codecraft.webmagic.scheduler; -import com.alibaba.fastjson.JSON; +import java.util.Set; + import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.lang3.StringUtils; + +import com.alibaba.fastjson.JSON; + import redis.clients.jedis.Jedis; import redis.clients.jedis.JedisPool; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; -import java.util.Set; - /** * the redis scheduler with priority * @author sai * Created by sai on 16-5-27. */ -public class RedisPriorityScheduler extends RedisScheduler -{ +public class RedisPriorityScheduler extends RedisScheduler { private static final String ZSET_PREFIX = "zset_"; @@ -37,62 +38,44 @@ public RedisPriorityScheduler(JedisPool pool) { } @Override - protected void pushWhenNoDuplicate(Request request, Task task) - { - Jedis jedis = pool.getResource(); - try - { - if(request.getPriority() > 0) + protected void pushWhenNoDuplicate(Request request, Task task) { + try (Jedis jedis = pool.getResource()) { + if (request.getPriority() > 0) { jedis.zadd(getZsetPlusPriorityKey(task), request.getPriority(), request.getUrl()); - else if(request.getPriority() < 0) + } else if (request.getPriority() < 0) { jedis.zadd(getZsetMinusPriorityKey(task), request.getPriority(), request.getUrl()); - else + } else { jedis.lpush(getQueueNoPriorityKey(task), request.getUrl()); + } setExtrasInItem(jedis, request, task); } - finally - { - pool.returnResource(jedis); - } } @Override - public synchronized Request poll(Task task) - { - Jedis jedis = pool.getResource(); - try - { + public synchronized Request poll(Task task) { + try (Jedis jedis = pool.getResource()) { String url = getRequest(jedis, task); - if(StringUtils.isBlank(url)) + if (StringUtils.isBlank(url)) { return null; + } return getExtrasInItem(jedis, url, task); } - finally - { - pool.returnResource(jedis); - } } - private 
String getRequest(Jedis jedis, Task task) - { + private String getRequest(Jedis jedis, Task task) { String url; Set urls = jedis.zrevrange(getZsetPlusPriorityKey(task), 0, 0); - if(urls.isEmpty()) - { + if (urls.isEmpty()) { url = jedis.lpop(getQueueNoPriorityKey(task)); - if(StringUtils.isBlank(url)) - { + if (StringUtils.isBlank(url)) { urls = jedis.zrevrange(getZsetMinusPriorityKey(task), 0, 0); - if(!urls.isEmpty()) - { + if (!urls.isEmpty()) { url = urls.toArray(new String[0])[0]; jedis.zrem(getZsetMinusPriorityKey(task), url); } } - } - else - { + } else { url = urls.toArray(new String[0])[0]; jedis.zrem(getZsetPlusPriorityKey(task), url); } @@ -100,51 +83,39 @@ private String getRequest(Jedis jedis, Task task) } @Override - public void resetDuplicateCheck(Task task) - { - Jedis jedis = pool.getResource(); - try - { + public void resetDuplicateCheck(Task task) { + try (Jedis jedis = pool.getResource()) { jedis.del(getSetKey(task)); } - finally - { - pool.returnResource(jedis); - } } - private String getZsetPlusPriorityKey(Task task) - { + private String getZsetPlusPriorityKey(Task task) { return ZSET_PREFIX + task.getUUID() + PLUS_PRIORITY_SUFFIX; } - private String getQueueNoPriorityKey(Task task) - { + private String getQueueNoPriorityKey(Task task) { return QUEUE_PREFIX + task.getUUID() + NO_PRIORITY_SUFFIX; } - private String getZsetMinusPriorityKey(Task task) - { + private String getZsetMinusPriorityKey(Task task) { return ZSET_PREFIX + task.getUUID() + MINUS_PRIORITY_SUFFIX; } - private void setExtrasInItem(Jedis jedis,Request request, Task task) - { - if(request.getExtras() != null) - { - String field = DigestUtils.shaHex(request.getUrl()); + private void setExtrasInItem(Jedis jedis,Request request, Task task) { + if (!request.getExtras().isEmpty()) { + String field = DigestUtils.sha1Hex(request.getUrl()); String value = JSON.toJSONString(request); jedis.hset(getItemKey(task), field, value); } } - private Request getExtrasInItem(Jedis jedis, String 
url, Task task) - { + private Request getExtrasInItem(Jedis jedis, String url, Task task) { String key = getItemKey(task); - String field = DigestUtils.shaHex(url); + String field = DigestUtils.sha1Hex(url); byte[] bytes = jedis.hget(key.getBytes(), field.getBytes()); - if(bytes != null) + if (bytes != null) { return JSON.parseObject(new String(bytes), Request.class); + } return new Request(url); } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index c70d88507..8d61bea3b 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -1,8 +1,10 @@ package us.codecraft.webmagic.scheduler; -import com.alibaba.fastjson.JSON; import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.lang3.StringUtils; + +import com.alibaba.fastjson.JSON; + import redis.clients.jedis.Jedis; import redis.clients.jedis.JedisPool; import redis.clients.jedis.JedisPoolConfig; @@ -37,21 +39,15 @@ public RedisScheduler(JedisPool pool) { @Override public void resetDuplicateCheck(Task task) { - Jedis jedis = pool.getResource(); - try { + try (Jedis jedis = pool.getResource()) { jedis.del(getSetKey(task)); - } finally { - pool.returnResource(jedis); } } @Override public boolean isDuplicate(Request request, Task task) { - Jedis jedis = pool.getResource(); - try { + try (Jedis jedis = pool.getResource()) { return jedis.sadd(getSetKey(task), request.getUrl()) == 0; - } finally { - pool.returnResource(jedis); } } @@ -62,7 +58,7 @@ protected void pushWhenNoDuplicate(Request request, Task task) { try { jedis.rpush(getQueueKey(task), request.getUrl()); if (checkForAdditionalInfo(request)) { - String field = DigestUtils.shaHex(request.getUrl()); + String field = DigestUtils.sha1Hex(request.getUrl()); String value = 
JSON.toJSONString(request); jedis.hset((ITEM_PREFIX + task.getUUID()), field, value); } @@ -88,7 +84,7 @@ private boolean checkForAdditionalInfo(Request request) { return true; } - if (request.getExtras() != null && !request.getExtras().isEmpty()) { + if (!request.getExtras().isEmpty()) { return true; } if (request.getPriority() != 0L) { @@ -100,14 +96,13 @@ private boolean checkForAdditionalInfo(Request request) { @Override public synchronized Request poll(Task task) { - Jedis jedis = pool.getResource(); - try { + try (Jedis jedis = pool.getResource()) { String url = jedis.lpop(getQueueKey(task)); if (url == null) { return null; } String key = ITEM_PREFIX + task.getUUID(); - String field = DigestUtils.shaHex(url); + String field = DigestUtils.sha1Hex(url); byte[] bytes = jedis.hget(key.getBytes(), field.getBytes()); if (bytes != null) { Request o = JSON.parseObject(new String(bytes), Request.class); @@ -115,8 +110,6 @@ public synchronized Request poll(Task task) { } Request request = new Request(url); return request; - } finally { - pool.returnResource(jedis); } } @@ -134,23 +127,17 @@ protected String getItemKey(Task task) { @Override public int getLeftRequestsCount(Task task) { - Jedis jedis = pool.getResource(); - try { + try (Jedis jedis = pool.getResource()) { Long size = jedis.llen(getQueueKey(task)); return size.intValue(); - } finally { - pool.returnResource(jedis); } } @Override public int getTotalRequestsCount(Task task) { - Jedis jedis = pool.getResource(); - try { + try (Jedis jedis = pool.getResource()) { Long size = jedis.scard(getSetKey(task)); return size.intValue(); - } finally { - pool.returnResource(jedis); } } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java old mode 100755 new mode 100644 diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java 
b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java old mode 100755 new mode 100644 diff --git a/webmagic-extension/src/main/resources/log4j.xml b/webmagic-extension/src/main/resources/log4j.xml deleted file mode 100644 index c2b5a2f53..000000000 --- a/webmagic-extension/src/main/resources/log4j.xml +++ /dev/null @@ -1,21 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessorTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessorTest.java index 63c40d295..c2081dbf3 100644 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessorTest.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessorTest.java @@ -13,7 +13,6 @@ /** * @author code4crafter@gmail.com - * @date 14-4-5 */ public class ConfigurablePageProcessorTest { diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java index 91e3698cf..bb18aa2c5 100644 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java @@ -938,6 +938,7 @@ public Page download(Request request, Task task) { Page page = new Page(); page.setRawText(html); page.setStatusCode(200); + page.setDownloadSuccess(true); page.setRequest(new Request("https://github.com/code4craft/webmagic")); page.setUrl(new PlainText("https://github.com/code4craft/webmagic")); return page; diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/ModelPageProcessorTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/ModelPageProcessorTest.java index 627fa6e84..1014a45f5 100644 --- 
a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/ModelPageProcessorTest.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/ModelPageProcessorTest.java @@ -12,7 +12,6 @@ /** * @author code4crafter@gmail.com - * @date 14-4-4 */ public class ModelPageProcessorTest { diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMocker.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMocker.java index 4b0c133cb..0451edcfe 100644 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMocker.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMocker.java @@ -1,11 +1,13 @@ package us.codecraft.webmagic.model; + +import java.io.IOException; +import java.nio.charset.Charset; import org.apache.commons.io.IOUtils; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.selector.PlainText; -import java.io.IOException; /** * @author code4crafter@gmail.com @@ -16,7 +18,7 @@ public class PageMocker { public Page getMockJsonPage() throws IOException { Page page = new Page(); - page.setRawText(IOUtils.toString(PageMocker.class.getClassLoader().getResourceAsStream("json/mock-githubrepo.json"))); + page.setRawText(IOUtils.toString(PageMocker.class.getClassLoader().getResourceAsStream("json/mock-githubrepo.json"), Charset.defaultCharset())); page.setRequest(new Request("https://api.github.com/repos/code4craft/webmagic")); page.setUrl(new PlainText("https://api.github.com/repos/code4craft/webmagic")); return page; @@ -24,7 +26,7 @@ public Page getMockJsonPage() throws IOException { public Page getMockPage() throws IOException { Page page = new Page(); - page.setRawText(IOUtils.toString(PageMocker.class.getClassLoader().getResourceAsStream("html/mock-webmagic.html"))); + page.setRawText(IOUtils.toString(PageMocker.class.getClassLoader().getResourceAsStream("html/mock-webmagic.html"), Charset.defaultCharset())); page.setRequest(new 
Request("http://webmagic.io/list/0")); page.setUrl(new PlainText("http://webmagic.io/list/0")); return page; diff --git a/webmagic-extension/src/test/resources/log4j.xml b/webmagic-extension/src/test/resources/log4j.xml deleted file mode 100644 index c2b5a2f53..000000000 --- a/webmagic-extension/src/test/resources/log4j.xml +++ /dev/null @@ -1,21 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - diff --git a/webmagic-extension/src/test/resources/log4j2-test.xml b/webmagic-extension/src/test/resources/log4j2-test.xml new file mode 100644 index 000000000..86aee5f59 --- /dev/null +++ b/webmagic-extension/src/test/resources/log4j2-test.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 44fee7c0d..50e79c73e 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -1,9 +1,14 @@ - + - webmagic-parent us.codecraft - 0.7.3 + webmagic + 1.0.3 4.0.0 @@ -21,8 +26,24 @@ ${project.version} - junit - junit + org.mapdb + mapdb + 3.1.0 + + + com.fasterxml.jackson.core + jackson-core + 2.15.2 + + + com.fasterxml.jackson.core + jackson-annotations + 2.15.2 + + + com.fasterxml.jackson.core + jackson-databind + 2.16.0 diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java index 941bdbde8..136e88d9e 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java @@ -49,7 +49,7 @@ public static void main(String[] args) { @Override public String key() { - return author+":"+name; + return author+"_"+name; } public String getName() { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/DuplicateStorageRemover.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/DuplicateStorageRemover.java new file mode 100644 index 
000000000..bee80e775 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/DuplicateStorageRemover.java @@ -0,0 +1,78 @@ +package us.codecraft.webmagic.recover; + +import com.google.common.base.Charsets; +import com.google.common.hash.BloomFilter; +import com.google.common.hash.Funnels; +import org.mapdb.DB; +import org.mapdb.DBMaker; +import org.mapdb.IndexTreeList; +import org.mapdb.Serializer; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.scheduler.component.DuplicateRemover; + +import java.util.concurrent.atomic.AtomicInteger; + +/** + * @author :linweisen + */ +public class DuplicateStorageRemover implements DuplicateRemover { + + private DB db; + + private static String DATABASE_NAME = "duplicate"; + + private IndexTreeList urlDuplicateQueue; + + private BloomFilter bloomFilter; + + private AtomicInteger counter; + + public DuplicateStorageRemover(String path) { + + String duplicatStoragePath = path; + + DB db = DBMaker.fileDB(duplicatStoragePath) + .fileMmapEnableIfSupported() + .fileMmapPreclearDisable() + .cleanerHackEnable() + .closeOnJvmShutdown() + .transactionEnable() + .concurrencyScale(128) + .make(); + this.db = db; + + this.urlDuplicateQueue = db.indexTreeList(DATABASE_NAME, Serializer.STRING).createOrOpen(); + + counter = new AtomicInteger(this.urlDuplicateQueue.size()); + this.bloomFilter = BloomFilter.create(Funnels.stringFunnel(Charsets.UTF_8), 200000, 1E-7); + for (String url : this.urlDuplicateQueue){ + bloomFilter.put(url); + } + + } + + @Override + public boolean isDuplicate(Request request, Task task) { + String url = request.getUrl(); + boolean isDuplicate = bloomFilter.mightContain(url); + if (!isDuplicate) { + bloomFilter.put(url); + urlDuplicateQueue.add(url); + this.db.commit(); + counter.incrementAndGet(); + } + return isDuplicate; + } + + @Override + public void resetDuplicateCheck(Task task) { + this.bloomFilter = 
BloomFilter.create(Funnels.stringFunnel(Charsets.UTF_8), 200000, 1E-7); + this.urlDuplicateQueue.clear(); + } + + @Override + public int getTotalRequestsCount(Task task) { + return counter.get(); + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/MmapQueueScheduler.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/MmapQueueScheduler.java new file mode 100644 index 000000000..4cee18afd --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/MmapQueueScheduler.java @@ -0,0 +1,85 @@ +package us.codecraft.webmagic.recover; + +import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.commons.lang3.StringUtils; +import org.mapdb.DB; +import org.mapdb.DBMaker; +import org.mapdb.IndexTreeList; +import org.mapdb.Serializer; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.scheduler.DuplicateRemovedScheduler; +import us.codecraft.webmagic.scheduler.component.DuplicateRemover; + +import java.io.IOException; + +/** + * @author :linweisen + */ +public class MmapQueueScheduler extends DuplicateRemovedScheduler { + + private DB db; + + private static String DATABASE_NAME = "queue"; + + private IndexTreeList queue; + + private static ObjectMapper mapper; + + public MmapQueueScheduler(DuplicateRemover duplicateRemover, String path) { + super.setDuplicateRemover(duplicateRemover); + + String queuePath = path; + + DB db = DBMaker.fileDB(queuePath) + .fileMmapEnableIfSupported() + .fileMmapPreclearDisable() + .cleanerHackEnable() + .closeOnJvmShutdown() + .transactionEnable() + .concurrencyScale(128) + .make(); + this.db = db; + this.mapper = new ObjectMapper(); + this.queue = db.indexTreeList(MmapQueueScheduler.DATABASE_NAME, Serializer.STRING).createOrOpen(); + } + + @Override + public Request poll(Task task) { + if (this.queue.size() > 0){ + String s = queue.remove(0); + return fromJson(s, Request.class); + }else{ + return null; + } + + } + + 
@Override + public void pushWhenNoDuplicate(Request request, Task task) { + queue.add(toJson(request)); + this.db.commit(); + } + + public String toJson(Object object) { + try { + return mapper.writeValueAsString(object); + } catch (IOException e) { + logger.warn("write to json string error:" + object, e); + return null; + } + } + + public T fromJson(String jsonString, Class clazz) { + if (StringUtils.isEmpty(jsonString)) { + return null; + } + try { + return mapper.readValue(jsonString, clazz); + } catch (IOException e) { + logger.warn("parse json string error:" + jsonString, e); + return null; + } + } + +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/RecoverSample.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/RecoverSample.java new file mode 100644 index 000000000..4fb91a0d2 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/RecoverSample.java @@ -0,0 +1,22 @@ +package us.codecraft.webmagic.recover; + + +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.samples.SinaBlogProcessor; +import us.codecraft.webmagic.scheduler.component.DuplicateRemover; + +/** + * @author code4crafter@gmail.com
+ */ +public class RecoverSample { + + public static void main(String[] args) { + String storage = "queue"; + String duplicate = "duplicate"; + Spider spider = new Spider(new SinaBlogProcessor()); + DuplicateRemover remover = new DuplicateStorageRemover(duplicate); + spider.setScheduler(new MmapQueueScheduler(remover, storage)); + spider.addUrl("http://blog.sina.com.cn/s/articlelist_1487828712_0_1.html") + .run(); + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java index ab560e451..46476bbc8 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java @@ -1,14 +1,14 @@ package us.codecraft.webmagic.samples; -import org.apache.commons.collections.CollectionUtils; + +import java.util.List; +import org.apache.commons.collections4.CollectionUtils; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.selector.JsonPathSelector; -import java.util.List; - /** * @author code4crafter@gmail.com * @since 0.5.0 diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java index 280f8f186..33dd6aa35 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java @@ -1,6 +1,6 @@ package us.codecraft.webmagic.samples; -import org.apache.commons.collections.CollectionUtils; +import org.apache.commons.collections4.CollectionUtils; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import 
us.codecraft.webmagic.Spider; diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/PhantomJSPageProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/PhantomJSPageProcessor.java index 99d5fa84e..ab5314073 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/PhantomJSPageProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/PhantomJSPageProcessor.java @@ -36,7 +36,7 @@ public Site getSite() { } public static void main(String[] args) throws Exception { - PhantomJSDownloader phantomDownloader = new PhantomJSDownloader().setRetryNum(3); + PhantomJSDownloader phantomDownloader = new PhantomJSDownloader(); CollectorPipeline collectorPipeline = new ResultItemsCollectorPipeline(); diff --git a/webmagic-samples/src/main/resources/log4j.xml b/webmagic-samples/src/main/resources/log4j.xml deleted file mode 100644 index a6630f813..000000000 --- a/webmagic-samples/src/main/resources/log4j.xml +++ /dev/null @@ -1,26 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/webmagic-samples/src/main/resources/log4j2.xml b/webmagic-samples/src/main/resources/log4j2.xml new file mode 100644 index 000000000..f3bad53d8 --- /dev/null +++ b/webmagic-samples/src/main/resources/log4j2.xml @@ -0,0 +1,19 @@ + + + + + + + + + + + + + + + + + + + diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index da0c5f202..26d1989d6 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -1,14 +1,23 @@ - + - webmagic-parent us.codecraft - 0.7.3 + webmagic + 1.0.3 4.0.0 webmagic-saxon + + true + + ${project.groupId} @@ -23,23 +32,6 @@ net.sf.saxon Saxon-HE - - junit - junit - - - - - org.apache.maven.plugins - maven-deploy-plugin - 3.0.0-M1 - - true - - - - - diff --git a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/JaxpSelectorUtils.java b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/JaxpSelectorUtils.java new file mode 100644 index 
000000000..b03f3a2ab --- /dev/null +++ b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/JaxpSelectorUtils.java @@ -0,0 +1,61 @@ +package us.codecraft.webmagic.selector; + +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; + +import javax.xml.transform.OutputKeys; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerException; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; +import java.io.StringWriter; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +/** + * @author hooy + */ +public final class JaxpSelectorUtils { + + private JaxpSelectorUtils() { + throw new RuntimeException("The util class cannot be instanced"); + } + + public static List NodeListToArrayList(NodeList nodes) { + List list = new ArrayList<>(nodes.getLength()); + for (int i = 0; i < nodes.getLength(); i++) { + list.add(nodes.item(i)); + } + return list; + } + + public static String nodeToString(Node node) throws TransformerException { + List before = Collections.singletonList(node); + List after = nodesToStrings(before); + if (after.size() > 0) { + return after.get(0); + } else { + return null; + } + } + + public static List nodesToStrings(List nodes) throws TransformerException { + List results = new ArrayList<>(nodes.size()); + Transformer transformer = TransformerFactory.newInstance().newTransformer(); + StreamResult xmlOutput = new StreamResult(); + transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); + for (Node node : nodes) { + if (node.getNodeType() == Node.ATTRIBUTE_NODE || node.getNodeType() == Node.TEXT_NODE) { + results.add(node.getTextContent()); + } else { + xmlOutput.setWriter(new StringWriter()); + transformer.transform(new DOMSource(node), xmlOutput); + results.add(xmlOutput.getWriter().toString()); + } + } + return results; + } + +} diff --git 
a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/NodeSelector.java b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/NodeSelector.java new file mode 100644 index 000000000..3e6339dda --- /dev/null +++ b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/NodeSelector.java @@ -0,0 +1,32 @@ +package us.codecraft.webmagic.selector; + +import org.w3c.dom.Node; + +import java.util.List; + +/** + * Selector(extractor) for html node.
+ * + * @author hooy
+ * @since 0.8.0 + */ +public interface NodeSelector { + + /** + * Extract single result in text.
+ * If there are more than one result, only the first will be chosen. + * + * @param node node + * @return result + */ + String select(Node node); + + /** + * Extract all results in text.
+ * + * @param node node + * @return results + */ + List selectList(Node node); + +} diff --git a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java index d8aab6cce..6c5d7b332 100644 --- a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java +++ b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java @@ -1,7 +1,14 @@ package us.codecraft.webmagic.selector; -import net.sf.saxon.lib.NamespaceConstant; -import net.sf.saxon.xpath.XPathEvaluator; +import java.util.*; +import java.util.concurrent.ConcurrentHashMap; + +import javax.xml.namespace.NamespaceContext; +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.xpath.XPathConstants; +import javax.xml.xpath.XPathExpression; +import javax.xml.xpath.XPathExpressionException; + import org.htmlcleaner.CleanerProperties; import org.htmlcleaner.DomSerializer; import org.htmlcleaner.HtmlCleaner; @@ -12,36 +19,26 @@ import org.w3c.dom.Node; import org.w3c.dom.NodeList; -import javax.xml.namespace.NamespaceContext; -import javax.xml.transform.OutputKeys; -import javax.xml.transform.Transformer; -import javax.xml.transform.TransformerFactory; -import javax.xml.transform.dom.DOMSource; -import javax.xml.transform.stream.StreamResult; -import javax.xml.xpath.XPathConstants; -import javax.xml.xpath.XPathExpression; -import javax.xml.xpath.XPathExpressionException; -import java.io.StringWriter; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.concurrent.ConcurrentHashMap; +import net.sf.saxon.lib.NamespaceConstant; +import net.sf.saxon.xpath.XPathEvaluator; +import us.codecraft.webmagic.utils.BaseSelectorUtils; + +import static us.codecraft.webmagic.selector.JaxpSelectorUtils.*; /** * 支持xpath2.0的选择器。包装了HtmlCleaner和Saxon HE。
* - * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 上午9:39 + * @author code4crafter@gmail.com, hooy
+ * Date: 13-4-21 + * Time: 上午9:39 */ -public class Xpath2Selector implements Selector { +public class Xpath2Selector implements Selector, NodeSelector { - private String xpathStr; + private final String xpathStr; private XPathExpression xPathExpression; - private Logger logger = LoggerFactory.getLogger(getClass()); + private final Logger logger = LoggerFactory.getLogger(getClass()); public Xpath2Selector(String xpathStr) { this.xpathStr = xpathStr; @@ -52,27 +49,28 @@ public Xpath2Selector(String xpathStr) { } } + public static Xpath2Selector newInstance(String xpathStr) { + return new Xpath2Selector(xpathStr); + } + enum XPath2NamespaceContext implements NamespaceContext { INSTANCE; - private final Map prefix2NamespaceMap = new ConcurrentHashMap(); + private final Map prefix2NamespaceMap = new ConcurrentHashMap<>(); - private final Map> namespace2PrefixMap = new ConcurrentHashMap>(); + private final Map> namespace2PrefixMap = new ConcurrentHashMap<>(); private void put(String prefix, String namespaceURI) { prefix2NamespaceMap.put(prefix, namespaceURI); - List prefixes = namespace2PrefixMap.get(namespaceURI); - if (prefixes == null) { - prefixes = new ArrayList(); - namespace2PrefixMap.put(namespaceURI, prefixes); - } + List prefixes = namespace2PrefixMap.computeIfAbsent(namespaceURI, k -> new ArrayList<>()); prefixes.add(prefix); } - private XPath2NamespaceContext() { + XPath2NamespaceContext() { put("fn", NamespaceConstant.FN); put("xslt", NamespaceConstant.XSLT); + put("xhtml", NamespaceConstant.XHTML); } @Override @@ -108,32 +106,18 @@ private void init() throws XPathExpressionException { @Override public String select(String text) { try { - HtmlCleaner htmlCleaner = new HtmlCleaner(); - TagNode tagNode = htmlCleaner.clean(text); - Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode); - Object result; - try { - result = xPathExpression.evaluate(document, XPathConstants.NODESET); - } catch (XPathExpressionException e) { - result = 
xPathExpression.evaluate(document, XPathConstants.STRING); - } - if (result instanceof NodeList) { - NodeList nodeList = (NodeList) result; - if (nodeList.getLength() == 0) { - return null; - } - Node item = nodeList.item(0); - if (item.getNodeType() == Node.ATTRIBUTE_NODE || item.getNodeType() == Node.TEXT_NODE) { - return item.getTextContent(); - } else { - StreamResult xmlOutput = new StreamResult(new StringWriter()); - Transformer transformer = TransformerFactory.newInstance().newTransformer(); - transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); - transformer.transform(new DOMSource(item), xmlOutput); - return xmlOutput.getWriter().toString(); - } - } - return result.toString(); + Document doc = parse(text); + return select(doc); + } catch (Exception e) { + logger.error("select text error! " + xpathStr, e); + } + return null; + } + + @Override + public String select(Node node) { + try { + return (String) xPathExpression.evaluate(node, XPathConstants.STRING); } catch (Exception e) { logger.error("select text error! 
" + xpathStr, e); } @@ -142,38 +126,72 @@ public String select(String text) { @Override public List selectList(String text) { - List results = new ArrayList(); try { - HtmlCleaner htmlCleaner = new HtmlCleaner(); - TagNode tagNode = htmlCleaner.clean(text); - Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode); - Object result; - try { - result = xPathExpression.evaluate(document, XPathConstants.NODESET); - } catch (XPathExpressionException e) { - result = xPathExpression.evaluate(document, XPathConstants.STRING); - } - if (result instanceof NodeList) { - NodeList nodeList = (NodeList) result; - Transformer transformer = TransformerFactory.newInstance().newTransformer(); - StreamResult xmlOutput = new StreamResult(); - transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); - for (int i = 0; i < nodeList.getLength(); i++) { - Node item = nodeList.item(i); - if (item.getNodeType() == Node.ATTRIBUTE_NODE || item.getNodeType() == Node.TEXT_NODE) { - results.add(item.getTextContent()); - } else { - xmlOutput.setWriter(new StringWriter()); - transformer.transform(new DOMSource(item), xmlOutput); - results.add(xmlOutput.getWriter().toString()); - } - } - } else { - results.add(result.toString()); - } + Document doc = parse(text); + return selectList(doc); } catch (Exception e) { logger.error("select text error! " + xpathStr, e); } - return results; + return null; } + + @Override + public List selectList(Node node) { + try { + NodeList result = (NodeList) xPathExpression.evaluate(node, XPathConstants.NODESET); + List nodes = NodeListToArrayList(result); + return nodesToStrings(nodes); + } catch (Exception e) { + logger.error("select text error! " + xpathStr, e); + } + return null; + } + + public Node selectNode(String text) { + try { + Document doc = parse(text); + return selectNode(doc); + } catch (Exception e) { + logger.error("select text error! 
" + xpathStr, e); + } + return null; + } + + public Node selectNode(Node node) { + try { + return (Node) xPathExpression.evaluate(node, XPathConstants.NODE); + } catch (Exception e) { + logger.error("select text error! " + xpathStr, e); + } + return null; + } + + public List selectNodes(String text) { + try { + Document doc = parse(text); + return selectNodes(doc); + } catch (Exception e) { + logger.error("select text error! " + xpathStr, e); + } + return null; + } + + public List selectNodes(Node node) { + try { + NodeList result = (NodeList) xPathExpression.evaluate(node, XPathConstants.NODESET); + return NodeListToArrayList(result); + } catch (Exception e) { + logger.error("select text error! " + xpathStr, e); + } + return null; + } + + protected static Document parse(String text) throws ParserConfigurationException { + // HtmlCleaner could not parse or tag directly + text = BaseSelectorUtils.preParse(text); + HtmlCleaner htmlCleaner = new HtmlCleaner(); + TagNode tagNode = htmlCleaner.clean(text); + return new DomSerializer(new CleanerProperties()).createDOM(tagNode); + } + } diff --git a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index aa3765a0c..4033fcfbd 100644 --- a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -1,5 +1,7 @@ package us.codecraft.webmagic.selector; +import java.util.List; + import org.htmlcleaner.HtmlCleaner; import org.htmlcleaner.TagNode; import org.htmlcleaner.XPatherException; @@ -8,9 +10,16 @@ import org.junit.Assert; import org.junit.Ignore; import org.junit.Test; + +import org.w3c.dom.Node; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.xsoup.XPathEvaluator; import 
us.codecraft.xsoup.Xsoup; +import javax.xml.transform.TransformerException; + /** * @author code4crafter@gmail.com
Date: 13-4-21 Time: 上午10:06 */ @@ -1367,46 +1376,50 @@ public void testOschina() { public void testXPath2() { String text = "

眉山:扎实推进农业农村工作 促农持续增收
\n" + "2013-07-31 23:29:45   来源:眉山网      责任编辑:张斯炜

"; - XpathSelector xpathSelector = new XpathSelector("//h1/text()"); - System.out.println(xpathSelector.select(text)); + Xpath2Selector xpathSelector = new Xpath2Selector("//h1/text()"); + Assert.assertEquals("眉山:扎实推进农业农村工作 促农持续增收", xpathSelector.select(text)); } @Test public void testXpath2Selector() { - Xpath2Selector xpath2Selector = new Xpath2Selector("//a/@href"); + Xpath2Selector xpath2Selector = new Xpath2Selector("//xhtml:a/@href"); String select = xpath2Selector.select(html); - Assert.assertNotNull(select); + Assert.assertEquals("http://www.oschina.net/", select); + + List selectList = xpath2Selector.selectList(html); + Assert.assertEquals(113, selectList.size()); + Assert.assertEquals("http://www.oschina.net/", selectList.get(0)); } @Ignore("take long time") @Test public void performanceTest() { Xpath2Selector xpath2Selector = new Xpath2Selector("//a"); - long time =System.currentTimeMillis(); + long time = System.currentTimeMillis(); for (int i = 0; i < 1000; i++) { xpath2Selector.selectList(html); } - System.out.println(System.currentTimeMillis()-time); + System.out.println(System.currentTimeMillis() - time); XpathSelector xpathSelector = new XpathSelector("//a"); - time =System.currentTimeMillis(); + time = System.currentTimeMillis(); for (int i = 0; i < 1000; i++) { xpathSelector.selectList(html); } - System.out.println(System.currentTimeMillis()-time); + System.out.println(System.currentTimeMillis() - time); - time =System.currentTimeMillis(); + time = System.currentTimeMillis(); for (int i = 0; i < 1000; i++) { xpath2Selector.selectList(html); } System.out.println(System.currentTimeMillis() - time); CssSelector cssSelector = new CssSelector("a"); - time =System.currentTimeMillis(); + time = System.currentTimeMillis(); for (int i = 0; i < 1000; i++) { cssSelector.selectList(html); } - System.out.println("css "+(System.currentTimeMillis()-time)); + System.out.println("css " + (System.currentTimeMillis() - time)); } @Ignore("take long time") @@ 
-1418,55 +1431,92 @@ public void parserPerformanceTest() throws XPatherException { TagNode tagNode = htmlCleaner.clean(html); Document document = Jsoup.parse(html); - long time =System.currentTimeMillis(); + long time = System.currentTimeMillis(); for (int i = 0; i < 2000; i++) { htmlCleaner.clean(html); } - System.out.println(System.currentTimeMillis()-time); + System.out.println(System.currentTimeMillis() - time); - time =System.currentTimeMillis(); + time = System.currentTimeMillis(); for (int i = 0; i < 2000; i++) { tagNode.evaluateXPath("//a"); } - System.out.println(System.currentTimeMillis()-time); + System.out.println(System.currentTimeMillis() - time); System.out.println("============="); - time =System.currentTimeMillis(); + time = System.currentTimeMillis(); for (int i = 0; i < 2000; i++) { Jsoup.parse(html); } - System.out.println(System.currentTimeMillis()-time); + System.out.println(System.currentTimeMillis() - time); - time =System.currentTimeMillis(); + time = System.currentTimeMillis(); for (int i = 0; i < 2000; i++) { document.select("a"); } - System.out.println(System.currentTimeMillis()-time); + System.out.println(System.currentTimeMillis() - time); System.out.println("============="); - time =System.currentTimeMillis(); + time = System.currentTimeMillis(); for (int i = 0; i < 2000; i++) { htmlCleaner.clean(html); } - System.out.println(System.currentTimeMillis()-time); + System.out.println(System.currentTimeMillis() - time); - time =System.currentTimeMillis(); + time = System.currentTimeMillis(); for (int i = 0; i < 2000; i++) { tagNode.evaluateXPath("//a"); } - System.out.println(System.currentTimeMillis()-time); + System.out.println(System.currentTimeMillis() - time); System.out.println("============="); XPathEvaluator compile = Xsoup.compile("//a"); - time =System.currentTimeMillis(); + time = System.currentTimeMillis(); for (int i = 0; i < 2000; i++) { compile.evaluate(document); } - System.out.println(System.currentTimeMillis()-time); + 
System.out.println(System.currentTimeMillis() - time); + + } + /** + * New api test + * + * @author hooy + * @since 8.0 + */ + private String rank = "

点击榜

排名分类书名/最新章节作者推荐更新时间
1.现实
0
11-24 22:32
2.架空
1047
03-04 14:44
3.现实
0
07-20 09:06
4.豪门
0
12-03 09:12
5.现实
0
02-01 21:12
6.玄奇
3455
02-28 12:31
7.玄奇
20614
03-31 12:37
8.复仇
55
06-03 11:43
9.穿越
0
10-27 18:50
10.宫斗
320
10-31 13:58
11.宫斗
6268
07-12 20:23
12.现实
0
01-18 23:00
13.婚恋
0
12-14 20:50
14.修真
0
02-03 23:40
15.豪门
0
11-06 23:38
16.穿越
191
12-02 23:37
17.穿越
412
10-13 22:39
18.豪门
635
07-01 13:15
19.架空
144
06-18 09:35
20.宅斗
1032
08-15 19:03
21.宫斗
0
09-30 20:32
22.豪门
0
06-05 11:31
23.重生
80
11-25 19:56
24.异世
68
01-12 10:06
25.豪门
0
05-29 18:46
26.婚恋
2778
11-04 17:48
27.玄奇
207
12-06 16:57
28.穿越
260
01-04 23:26
29.豪门
0
12-07 21:39
30.架空
1127
06-06 17:28
31.穿越
113
09-13 09:06
32.架空
597
02-14 18:47
33.玄奇
528
06-04 22:04
34.穿越
328
06-06 22:09
35.架空
539
05-24 14:42
36.架空
0
03-05 23:27
37.穿越
3215
08-21 16:38
38.宫斗
905
08-04 20:24
39.玄奇
1328
07-25 10:58
40.穿越
203
01-27 20:53
41.宫斗
407
08-31 09:03
42.宅斗
16
05-03 17:38
43.豪门
0
11-10 08:00
44.婚恋
0
07-12 21:37
45.架空
0
06-23 21:02
46.玄奇
1382
05-31 20:36
47.重生
334
07-16 19:19
48.婚恋
505
11-01 16:42
49.婚恋
0
10-19 18:32
50.豪门
540
09-19 19:18
51.婚恋
226
03-18 13:09
52.穿越
1026
03-08 16:28
53.重生
304
02-19 10:25
54.玄奇
2617
02-15 20:57
55.穿越
199
09-04 19:43
56.同人
768
07-19 20:00
57.宅斗
0
02-13 18:13
58.豪门
0
11-12 22:23
59.架空
0
07-28 23:42
60.婚恋
0
02-03 23:09
61.豪门
285
01-07 19:21
62.重生
654
10-12 18:16
63.异能
617
06-18 20:23
64.宫斗
27
06-02 21:05
65.种田
206
08-31 19:23
66.宅斗
2444
08-19 15:51
67.宅斗
818
08-07 23:38
68.现代
0
12-23 17:02
69.玄奇
0
07-23 12:00
70.婚恋
0
11-01 16:43
71.豪门
0
09-12 00:01
72.架空
0
04-27 22:42
73.豪门
0
04-19 13:55
74.异能
62
07-30 00:00
75.穿越
1307
07-20 16:41
76.玄奇
12820
07-15 23:46
77.架空
828
06-06 17:54
78.宅斗
985
05-20 23:53
79.玄奇
4960
04-12 15:58
80.玄奇
245
03-02 23:11
81.宅斗
34
12-21 10:11
82.宅斗
1411
07-21 00:00
83.现代
0
07-31 10:10
84.玄奇
0
06-18 13:53
85.架空
0
12-03 23:41
86.玄奇
0
11-28 22:13
87.豪门
0
11-07 22:48
88.婚恋
0
08-29 23:15
89.种田
1831
08-21 16:38
90.豪门
0
07-11 21:25
91.豪门
0
06-13 15:37
92.豪门
0
05-07 22:10
93.豪门
0
02-28 00:01
94.豪门
304
12-16 07:30
95.婚恋
669
11-07 18:16
96.仙侠
54
09-25 19:51
97.豪门
655
08-31 13:02
98.现实
374
06-29 09:55
99.穿越
373
06-19 18:07
100.婚恋
159
06-04 21:05
"; + + @Test + public void testStringAPI() { + // testAPI: selectList(String) -> selectList(Node) + List items = new Xpath2Selector("//div[@class=\"bd\"]//tbody/tr").selectList(rank); + Assert.assertSame(100, items.size()); + // testAPI: select(String) -> select(Node) + String name = new Xpath2Selector("//td[3]/div/a[1]/text()").select(items.get(10)); + Assert.assertEquals("深宫安容传", name); + } + + @Test + public void testNodeAPI() { + // testAPI: selectNodes(String) -> selectNodes(Node) + List items = new Xpath2Selector("//div[@class=\"bd\"]//tbody/tr").selectNodes(rank); + Assert.assertSame(100, items.size()); + // testAPI: selectNode(Node) + Node item = new Xpath2Selector("./td[3]/div/a[1]").selectNode(items.get(10)); + String name = new Xpath2Selector("./text()").select(item); + Assert.assertEquals("深宫安容传", name); + } + + @Test + public void testUtilAPI() throws TransformerException { + Node item = Xpath2Selector.newInstance("//div[@class=\"bd\"]//tbody/tr[11]/td[3]/div/a[1]/text()").selectNode(rank); + // testAPI: nodeToString(Node) -> nodesToStrings(List) + String name = JaxpSelectorUtils.nodeToString(item); + Assert.assertEquals("深宫安容传", name); } } diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml old mode 100755 new mode 100644 index 9f4219d6c..62cea3e69 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -1,18 +1,31 @@ - + - webmagic-parent us.codecraft - 0.7.3 + webmagic + 1.0.3 4.0.0 webmagic-scripts - 1.1.2-2 + 2.1.0 + + org.apache.logging.log4j + log4j-core + + + org.apache.logging.log4j + log4j-slf4j2-impl + org.jruby jruby @@ -22,10 +35,6 @@ kotlin-stdlib ${kotlin.version} - - org.codehaus.groovy - groovy-all - org.python jython @@ -34,25 +43,22 @@ commons-cli commons-cli - - junit - junit - test - ${project.groupId} webmagic-core ${project.version} - - org.slf4j - slf4j-log4j12 - ${project.groupId} webmagic-extension ${project.version} + + org.projectlombok + lombok + 1.18.32 + provided + @@ -94,4 +100,4 @@ - \ No newline 
at end of file + diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Params.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Params.java new file mode 100644 index 000000000..873176e6e --- /dev/null +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Params.java @@ -0,0 +1,47 @@ +package us.codecraft.webmagic.scripts; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import lombok.Getter; +import lombok.Setter; +import us.codecraft.webmagic.scripts.languages.JRuby; +import us.codecraft.webmagic.scripts.languages.Javascript; +import us.codecraft.webmagic.scripts.languages.Language; +import us.codecraft.webmagic.utils.WMCollections; + +public class Params { + @Getter + Language language = new Javascript(); + + @Getter @Setter + String scriptFileName; + + @Getter @Setter + List urls; + + @Getter @Setter + int thread = 1; + + @Getter @Setter + int sleepTime = 1000; + + private static Map> alias; + + public Params() { + alias = new HashMap>(); + alias.put(new Javascript(), WMCollections.newHashSet("js", "javascript", "JavaScript", "JS")); + alias.put(new JRuby(), WMCollections.newHashSet("ruby", "jruby", "Ruby", "JRuby")); + } + + public void setLanguagefromArg(String arg) { + for (Map.Entry> languageSetEntry : alias.entrySet()) { + if (languageSetEntry.getValue().contains(arg)) { + this.language = languageSetEntry.getKey(); + return; + } + } + } +} \ No newline at end of file diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java index 0423e58e1..c60b3ec3d 100755 --- a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java @@ -1,88 +1,21 @@ package us.codecraft.webmagic.scripts; import org.apache.commons.cli.*; -import 
org.apache.log4j.Level; -import org.apache.log4j.Logger; + import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.pipeline.Pipeline; +import us.codecraft.webmagic.scripts.config.CommandLineOption; import us.codecraft.webmagic.utils.WMCollections; -import java.util.HashMap; import java.util.List; -import java.util.Map; -import java.util.Set; /** - * @author code4crafter@gmail.com + * @author code4crafter@gmail.com / FrancoisGib * @since 0.4.1 */ public class ScriptConsole { - - private static class Params { - Language language = Language.JavaScript; - String scriptFileName; - List urls; - int thread = 1; - int sleepTime = 1000; - private static Map> alias = new HashMap>(); - - static { - alias.put(Language.JavaScript, WMCollections.newHashSet("js", "javascript", "JavaScript", "JS")); - alias.put(Language.JRuby, WMCollections.newHashSet("ruby", "jruby", "Ruby", "JRuby")); - } - - public void setLanguagefromArg(String arg) { - for (Map.Entry> languageSetEntry : alias.entrySet()) { - if (languageSetEntry.getValue().contains(arg)) { - this.language = languageSetEntry.getKey(); - return; - } - } - } - - private Language getLanguage() { - return language; - } - - private void setLanguage(Language language) { - this.language = language; - } - - private String getScriptFileName() { - return scriptFileName; - } - - private void setScriptFileName(String scriptFileName) { - this.scriptFileName = scriptFileName; - } - - private List getUrls() { - return urls; - } - - private void setUrls(List urls) { - this.urls = urls; - } - - private int getThread() { - return thread; - } - - private void setThread(int thread) { - this.thread = thread; - } - - private int getSleepTime() { - return sleepTime; - } - - private void setSleepTime(int sleepTime) { - this.sleepTime = sleepTime; - } - } - public static void main(String[] args) { Params params = parseCommand(args); startSpider(params); @@ -140,45 
+73,9 @@ private static void exit() { private static Params readOptions(CommandLine commandLine) { Params params = new Params(); - if (commandLine.hasOption("l")) { - String language = commandLine.getOptionValue("l"); - params.setLanguagefromArg(language); - } - if (commandLine.hasOption("f")) { - String scriptFilename = commandLine.getOptionValue("f"); - params.setScriptFileName(scriptFilename); - } else { - exit(); - } - if (commandLine.hasOption("s")) { - Integer sleepTime = Integer.parseInt(commandLine.getOptionValue("s")); - params.setSleepTime(sleepTime); - } - if (commandLine.hasOption("t")) { - Integer thread = Integer.parseInt(commandLine.getOptionValue("t")); - params.setThread(thread); - } - if (commandLine.hasOption("g")) { - configLogger(commandLine.getOptionValue("g")); - } - params.setUrls(commandLine.getArgList()); + List options = CommandLineOption.getAllOptions(); + for (CommandLineOption option : options) + option.addParamOptionIfInCommandLine(params, commandLine); return params; } - - private static void configLogger(String value) { - Logger rootLogger = Logger.getRootLogger(); - if ("debug".equalsIgnoreCase(value)) { - rootLogger.setLevel(Level.DEBUG); - } else if ("info".equalsIgnoreCase(value)) { - rootLogger.setLevel(Level.INFO); - } else if ("warn".equalsIgnoreCase(value)) { - rootLogger.setLevel(Level.WARN); - } else if ("trace".equalsIgnoreCase(value)) { - rootLogger.setLevel(Level.TRACE); - } else if ("off".equalsIgnoreCase(value)) { - rootLogger.setLevel(Level.OFF); - } else if ("error".equalsIgnoreCase(value)) { - rootLogger.setLevel(Level.ERROR); - } - } -} +} \ No newline at end of file diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptEnginePool.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptEnginePool.java index d1e5d7fe8..bdfbbaedb 100755 --- a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptEnginePool.java +++ 
b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptEnginePool.java @@ -2,6 +2,9 @@ import javax.script.ScriptEngine; import javax.script.ScriptEngineManager; + +import us.codecraft.webmagic.scripts.languages.Language; + import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.atomic.AtomicInteger; @@ -11,14 +14,11 @@ */ public class ScriptEnginePool { - private final int size; - private final AtomicInteger availableCount; private final LinkedBlockingQueue scriptEngines = new LinkedBlockingQueue(); public ScriptEnginePool(Language language,int size) { - this.size = size; this.availableCount = new AtomicInteger(size); for (int i=0;i getAllOptions() { + return List.of(new OptionL(), new OptionF(), new OptionS(), new OptionT(), new OptionG()); + } +} + +class OptionL extends CommandLineOption { + public OptionL() { + super('l'); + } + + protected void addParamOption(Params params, CommandLine commandLine) { + String language = commandLine.getOptionValue("l"); + params.setLanguagefromArg(language); + } +} + +class OptionF extends CommandLineOption { + public OptionF() { + super('f'); + } + + protected void addParamOption(Params params, CommandLine commandLine) { + String scriptFilename = commandLine.getOptionValue("f"); + params.setScriptFileName(scriptFilename); + } +} + +class OptionS extends CommandLineOption { + public OptionS() { + super('s'); + } + + protected void addParamOption(Params params, CommandLine commandLine) { + Integer sleepTime = Integer.parseInt(commandLine.getOptionValue("s")); + params.setSleepTime(sleepTime); + } +} + +class OptionT extends CommandLineOption { + public OptionT() { + super('t'); + } + + protected void addParamOption(Params params, CommandLine commandLine) { + Integer thread = Integer.parseInt(commandLine.getOptionValue("t")); + params.setThread(thread); + } +} + +class OptionG extends CommandLineOption { + public OptionG() { + super('g'); + } + + protected void addParamOption(Params params, 
CommandLine commandLine) { + ConfigLogger.configLogger(commandLine.getOptionValue("g")); + } +} \ No newline at end of file diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/config/ConfigLogger.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/config/ConfigLogger.java new file mode 100644 index 000000000..9e81ea6c7 --- /dev/null +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/config/ConfigLogger.java @@ -0,0 +1,34 @@ +package us.codecraft.webmagic.scripts.config; + +import java.util.List; + +import org.apache.commons.lang3.tuple.Pair; +import org.apache.logging.log4j.Level; +import org.apache.logging.log4j.core.Logger; +import org.slf4j.LoggerFactory; + +public class ConfigLogger { + /** + * Log the config parameter. If the counter is less than the number of available + * options then it means that the user entered an option + * + * @param value The config string + */ + public static void configLogger(String value) { + List> options = List.of( + Pair.of("debug", Level.DEBUG), + Pair.of("info", Level.INFO), + Pair.of("warn", Level.WARN), + Pair.of("trace", Level.TRACE), + Pair.of("off", Level.OFF), + Pair.of("error", Level.ERROR)); + Pair option = options.get(0); + int i = 1; + while (i < options.size() && !option.getLeft().equalsIgnoreCase(value)) + option = options.get(i++); + if (i < options.size()) { + Logger rootLogger = (Logger) LoggerFactory.getLogger(org.slf4j.Logger.ROOT_LOGGER_NAME); + rootLogger.setLevel(option.getRight()); + } + } +} diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/JRuby.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/JRuby.java new file mode 100644 index 000000000..b3a3209a5 --- /dev/null +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/JRuby.java @@ -0,0 +1,26 @@ +package us.codecraft.webmagic.scripts.languages; + +import java.util.Iterator; +import java.util.Map; + +import 
javax.script.ScriptEngine; +import javax.script.ScriptException; + +import org.jruby.RubyHash; + +import us.codecraft.webmagic.Page; + +public class JRuby extends Language { + public JRuby() { + super("jruby","ruby/defines.rb",""); + } + + public void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException { + RubyHash oRuby = (RubyHash) engine.eval(defines + "\n" + script, engine.getContext()); + Iterator itruby = oRuby.entrySet().iterator(); + while (itruby.hasNext()) { + Map.Entry pairs = (Map.Entry) itruby.next(); + page.getResultItems().put(pairs.getKey().toString(), pairs.getValue()); + } + } +} \ No newline at end of file diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Javascript.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Javascript.java new file mode 100644 index 000000000..b0f7b647a --- /dev/null +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Javascript.java @@ -0,0 +1,16 @@ +package us.codecraft.webmagic.scripts.languages; + +import javax.script.ScriptEngine; +import javax.script.ScriptException; + +import us.codecraft.webmagic.Page; + +public class Javascript extends Language { + public Javascript() { + super("javascript","js/defines.js",""); + } + + public void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException { + engine.eval(defines + "\n" + script, engine.getContext()); + } +} \ No newline at end of file diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Jython.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Jython.java new file mode 100644 index 000000000..9124d2dbb --- /dev/null +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Jython.java @@ -0,0 +1,27 @@ +package us.codecraft.webmagic.scripts.languages; + +import java.util.Iterator; +import java.util.Map; + +import 
javax.script.ScriptEngine; +import javax.script.ScriptException; + +import org.python.core.PyDictionary; + +import us.codecraft.webmagic.Page; + +public class Jython extends Language { + public Jython() { + super("jython","python/defines.py",""); + } + + public void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException { + engine.eval(defines + "\n" + script, engine.getContext()); + PyDictionary oJython = (PyDictionary) engine.get("result"); + Iterator it = oJython.entrySet().iterator(); + while (it.hasNext()) { + Map.Entry pairs = (Map.Entry) it.next(); + page.getResultItems().put(pairs.getKey().toString(), pairs.getValue()); + } + } +} \ No newline at end of file diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Language.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Language.java old mode 100755 new mode 100644 similarity index 51% rename from webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Language.java rename to webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Language.java index 2f9d22d57..44e6ba0a0 --- a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Language.java +++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Language.java @@ -1,15 +1,18 @@ -package us.codecraft.webmagic.scripts; +package us.codecraft.webmagic.scripts.languages; + +import javax.script.ScriptEngine; +import javax.script.ScriptException; +import us.codecraft.webmagic.Page; /** - * @author code4crafter@gmail.com + * @author FrancoisGib */ -public enum Language { - - JavaScript("javascript","js/defines.js",""), - - JRuby("jruby","ruby/defines.rb",""), - - Jython("jython","python/defines.py",""); +public abstract class Language { + public Language(String engineName, String defineFile, String gatherFile) { + this.engineName = engineName; + this.defineFile = defineFile; + this.gatherFile = gatherFile; + } private String 
engineName; @@ -17,12 +20,6 @@ public enum Language { private String gatherFile; - Language(String engineName, String defineFile, String gatherFile) { - this.engineName = engineName; - this.defineFile = defineFile; - this.gatherFile = gatherFile; - } - public String getEngineName() { return engineName; } @@ -34,4 +31,6 @@ public String getDefineFile() { public String getGatherFile() { return gatherFile; } + + public abstract void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException; } diff --git a/webmagic-scripts/src/main/resources/log4j.xml b/webmagic-scripts/src/main/resources/log4j.xml deleted file mode 100755 index 474269cb1..000000000 --- a/webmagic-scripts/src/main/resources/log4j.xml +++ /dev/null @@ -1,21 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - diff --git a/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java b/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java index ffeb9c993..b4c28521f 100755 --- a/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java +++ b/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java @@ -2,7 +2,11 @@ import org.junit.Ignore; import org.junit.Test; + import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.scripts.languages.JRuby; +import us.codecraft.webmagic.scripts.languages.Javascript; +import us.codecraft.webmagic.scripts.languages.Jython; /** * @author code4crafter@gmail.com @@ -13,14 +17,14 @@ public class ScriptProcessorTest { @Test public void testJavaScriptProcessor() { - ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(Language.JavaScript).scriptFromClassPathFile("js/oschina.js").build(); + ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(new Javascript()).scriptFromClassPathFile("js/oschina.js").build(); pageProcessor.getSite().setSleepTime(0); 
Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run(); } @Test public void testRubyProcessor() { - ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(Language.JRuby).scriptFromClassPathFile("ruby/oschina.rb").build(); + ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(new JRuby()).scriptFromClassPathFile("ruby/oschina.rb").build(); pageProcessor.getSite().setSleepTime(0); Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run(); } @@ -28,7 +32,7 @@ public void testRubyProcessor() { @Test public void testPythonProcessor() { - ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(Language.Jython).scriptFromClassPathFile("python/oschina.py").build(); + ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(new Jython()).scriptFromClassPathFile("python/oschina.py").build(); pageProcessor.getSite().setSleepTime(0); Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run(); } diff --git a/webmagic-scripts/src/test/resouces/log4j.xml b/webmagic-scripts/src/test/resouces/log4j.xml deleted file mode 100755 index 1f64d8dad..000000000 --- a/webmagic-scripts/src/test/resouces/log4j.xml +++ /dev/null @@ -1,21 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - diff --git a/webmagic-scripts/src/test/resources/log4j2-test.xml b/webmagic-scripts/src/test/resources/log4j2-test.xml new file mode 100644 index 000000000..e2fab6602 --- /dev/null +++ b/webmagic-scripts/src/test/resources/log4j2-test.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index dfc4a1958..16214c61a 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -1,45 +1,46 @@ - - - webmagic-parent - us.codecraft - 0.7.3 - - 4.0.0 + + + us.codecraft + webmagic + 1.0.3 + + 4.0.0 - webmagic-selenium + webmagic-selenium - 
- - org.seleniumhq.selenium - selenium-java - - - ${project.groupId} - webmagic-core - ${project.version} - - - com.github.detro - phantomjsdriver - - - junit - junit - - + + + org.seleniumhq.selenium + selenium-java + + + ${project.groupId} + webmagic-core + ${project.version} + + + com.github.detro + phantomjsdriver + + - - - - org.apache.maven.plugins - maven-deploy-plugin - 3.0.0-M1 - - true - - - - + + + + org.apache.maven.plugins + maven-deploy-plugin + 3.0.0-M1 + + true + + + + diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java index cce293fc9..f6d2574fb 100644 --- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java @@ -11,12 +11,14 @@ import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; -import us.codecraft.webmagic.downloader.Downloader; +import us.codecraft.webmagic.downloader.AbstractDownloader; import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.PlainText; +import us.codecraft.webmagic.utils.HttpConstant; import java.io.Closeable; import java.io.IOException; +import java.net.http.HttpRequest; import java.util.Map; /** @@ -24,112 +26,121 @@ * 需要下载Selenium driver支持。
* * @author code4crafter@gmail.com
- * Date: 13-7-26
- * Time: 下午1:37
+ * Date: 13-7-26
+ * Time: 下午1:37
*/ -public class SeleniumDownloader implements Downloader, Closeable { - - private volatile WebDriverPool webDriverPool; - - private Logger logger = LoggerFactory.getLogger(getClass()); - - private int sleepTime = 0; - - private int poolSize = 1; - - private static final String DRIVER_PHANTOMJS = "phantomjs"; - - /** - * 新建 - * - * @param chromeDriverPath chromeDriverPath - */ - public SeleniumDownloader(String chromeDriverPath) { - System.getProperties().setProperty("webdriver.chrome.driver", - chromeDriverPath); - } - - /** - * Constructor without any filed. Construct PhantomJS browser - * - * @author bob.li.0718@gmail.com - */ - public SeleniumDownloader() { - // System.setProperty("phantomjs.binary.path", - // "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs"); - } - - /** - * set sleep time to wait until load success - * - * @param sleepTime sleepTime - * @return this - */ - public SeleniumDownloader setSleepTime(int sleepTime) { - this.sleepTime = sleepTime; - return this; - } - - @Override - public Page download(Request request, Task task) { - checkInit(); - WebDriver webDriver; - try { - webDriver = webDriverPool.get(); - } catch (InterruptedException e) { - logger.warn("interrupted", e); - return null; - } - logger.info("downloading page " + request.getUrl()); - webDriver.get(request.getUrl()); - try { - Thread.sleep(sleepTime); - } catch (InterruptedException e) { - e.printStackTrace(); - } - WebDriver.Options manage = webDriver.manage(); - Site site = task.getSite(); - if (site.getCookies() != null) { - for (Map.Entry cookieEntry : site.getCookies() - .entrySet()) { - Cookie cookie = new Cookie(cookieEntry.getKey(), - cookieEntry.getValue()); - manage.addCookie(cookie); - } - } - - /* - * TODO You can add mouse event or other processes - * - * @author: bob.li.0718@gmail.com - */ - - WebElement webElement = webDriver.findElement(By.xpath("/html")); - String content = webElement.getAttribute("outerHTML"); - Page page = new Page(); - 
page.setRawText(content); - page.setHtml(new Html(content, request.getUrl())); - page.setUrl(new PlainText(request.getUrl())); - page.setRequest(request); - webDriverPool.returnToPool(webDriver); - return page; - } - - private void checkInit() { - if (webDriverPool == null) { - synchronized (this) { - webDriverPool = new WebDriverPool(poolSize); - } - } - } - - @Override - public void setThread(int thread) { - this.poolSize = thread; - } - - @Override - public void close() throws IOException { - webDriverPool.closeAll(); - } +public class SeleniumDownloader extends AbstractDownloader implements Closeable { + + private volatile WebDriverPool webDriverPool; + + private Logger logger = LoggerFactory.getLogger(getClass()); + + private int sleepTime = 0; + + private int poolSize = 1; + + private static final String DRIVER_PHANTOMJS = "phantomjs"; + + /** + * 新建 + * + * @param chromeDriverPath chromeDriverPath + */ + public SeleniumDownloader(String chromeDriverPath) { + System.getProperties().setProperty("webdriver.chrome.driver", + chromeDriverPath); + } + + /** + * Constructor without any filed. 
Construct PhantomJS browser + * + * @author bob.li.0718@gmail.com + */ + public SeleniumDownloader() { + // System.setProperty("phantomjs.binary.path", + // "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs"); + } + + /** + * set sleep time to wait until load success + * + * @param sleepTime sleepTime + * @return this + */ + public SeleniumDownloader setSleepTime(int sleepTime) { + this.sleepTime = sleepTime; + return this; + } + + @Override + public Page download(Request request, Task task) { + checkInit(); + WebDriver webDriver = null; + Page page = Page.fail(request); + try { + webDriver = webDriverPool.get(); + + logger.info("downloading page " + request.getUrl()); + webDriver.get(request.getUrl()); + try { + if (sleepTime > 0) { + Thread.sleep(sleepTime); + } + } catch (InterruptedException e) { + e.printStackTrace(); + } + WebDriver.Options manage = webDriver.manage(); + Site site = task.getSite(); + if (site.getCookies() != null) { + for (Map.Entry cookieEntry : site.getCookies() + .entrySet()) { + Cookie cookie = new Cookie(cookieEntry.getKey(), + cookieEntry.getValue()); + manage.addCookie(cookie); + } + } + + /* + * TODO You can add mouse event or other processes + * + * @author: bob.li.0718@gmail.com + */ + + WebElement webElement = webDriver.findElement(By.xpath("/html")); + String content = webElement.getAttribute("outerHTML"); + page.setDownloadSuccess(true); + page.setRawText(content); + page.setHtml(new Html(content, request.getUrl())); + page.setUrl(new PlainText(request.getUrl())); + page.setRequest(request); + page.setStatusCode(HttpConstant.StatusCode.CODE_200); + onSuccess(page, task); + } catch (Exception e) { + logger.warn("download page {} error", request.getUrl(), e); + onError(page, task, e); + } finally { + if (webDriver != null) { + webDriverPool.returnToPool(webDriver); + } + } + return page; + } + + private void checkInit() { + if (webDriverPool == null) { + synchronized (this) { + webDriverPool = new 
WebDriverPool(poolSize); + } + } + } + + @Override + public void setThread(int thread) { + this.poolSize = thread; + } + + @Override + public void close() throws IOException { + webDriverPool.closeAll(); + } } diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java index e1d9dd039..b96d2894b 100644 --- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java @@ -1,15 +1,5 @@ package us.codecraft.webmagic.downloader.selenium; -import org.openqa.selenium.WebDriver; -import org.openqa.selenium.chrome.ChromeDriver; -import org.openqa.selenium.firefox.FirefoxDriver; -import org.openqa.selenium.phantomjs.PhantomJSDriver; -import org.openqa.selenium.phantomjs.PhantomJSDriverService; -import org.openqa.selenium.remote.DesiredCapabilities; -import org.openqa.selenium.remote.RemoteWebDriver; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import java.io.FileReader; import java.io.IOException; import java.net.MalformedURLException; @@ -22,6 +12,18 @@ import java.util.concurrent.LinkedBlockingDeque; import java.util.concurrent.atomic.AtomicInteger; +import org.openqa.selenium.WebDriver; +import org.openqa.selenium.chrome.ChromeDriver; +import org.openqa.selenium.chrome.ChromeOptions; +import org.openqa.selenium.firefox.FirefoxDriver; +import org.openqa.selenium.firefox.FirefoxOptions; +import org.openqa.selenium.phantomjs.PhantomJSDriver; +import org.openqa.selenium.phantomjs.PhantomJSDriverService; +import org.openqa.selenium.remote.DesiredCapabilities; +import org.openqa.selenium.remote.RemoteWebDriver; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + /** * @author code4crafter@gmail.com
* Date: 13-7-26
@@ -58,7 +60,7 @@ class WebDriverPool { * Configure the GhostDriver, and initialize a WebDriver instance. This part * of code comes from GhostDriver. * https://github.com/detro/ghostdriver/tree/master/test/java/src/test/java/ghostdriver - * + * * @author bob.li.0718@gmail.com * @throws IOException */ @@ -73,7 +75,6 @@ public void configure() throws IOException { // Prepare capabilities sCaps = new DesiredCapabilities(); - sCaps.setJavascriptEnabled(true); sCaps.setCapability("takesScreenshot", false); String driver = sConfig.getProperty("driver", DRIVER_PHANTOMJS); @@ -134,9 +135,9 @@ public void configure() throws IOException { sCaps.setBrowserName("phantomjs"); mDriver = new RemoteWebDriver(new URL(driver), sCaps); } else if (driver.equals(DRIVER_FIREFOX)) { - mDriver = new FirefoxDriver(sCaps); + mDriver = new FirefoxDriver(new FirefoxOptions(sCaps)); } else if (driver.equals(DRIVER_CHROME)) { - mDriver = new ChromeDriver(sCaps); + mDriver = new ChromeDriver(new ChromeOptions().merge(sCaps)); } else if (driver.equals(DRIVER_PHANTOMJS)) { mDriver = new PhantomJSDriver(sCaps); } @@ -144,7 +145,7 @@ public void configure() throws IOException { /** * check whether input is a valid URL - * + * * @author bob.li.0718@gmail.com * @param urlString urlString * @return true means yes, otherwise no. 
@@ -178,7 +179,7 @@ public WebDriverPool() { } /** - * + * * @return * @throws InterruptedException */ diff --git a/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java b/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java index b7bcd80b3..43ac84b5a 100644 --- a/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java +++ b/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java @@ -1,17 +1,18 @@ package us.codecraft.webmagic.downloader; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; + import org.junit.Ignore; import org.junit.Test; import org.openqa.selenium.By; import org.openqa.selenium.WebDriver; import org.openqa.selenium.WebElement; import org.openqa.selenium.chrome.ChromeDriver; +import org.openqa.selenium.chrome.ChromeOptions; import org.openqa.selenium.remote.DesiredCapabilities; -import java.util.Arrays; -import java.util.HashMap; -import java.util.Map; - /** * @author code4crafter@gmail.com
* Date: 13-7-26
@@ -29,10 +30,10 @@ public void testSelenium() { Map preferences = new HashMap(); preferences.put("profile.default_content_settings", contentSettings); - DesiredCapabilities caps = DesiredCapabilities.chrome(); + DesiredCapabilities caps = new DesiredCapabilities(); caps.setCapability("chrome.prefs", preferences); caps.setCapability("chrome.switches", Arrays.asList("--user-data-dir=/Users/yihua/temp/chrome")); - WebDriver webDriver = new ChromeDriver(caps); + WebDriver webDriver = new ChromeDriver(new ChromeOptions().merge(caps)); webDriver.get("http://huaban.com/"); WebElement webElement = webDriver.findElement(By.xpath("/html")); System.out.println(webElement.getAttribute("outerHTML"));