"))
+ || ((text.startsWith("") || text.startsWith(" "))) {
+ text = "";
+ }
+ return text;
+ }
+
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java
index ccf00a466..63bb4c110 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java
@@ -21,6 +21,10 @@ public abstract class CharsetUtils {
private static Logger logger = LoggerFactory.getLogger(CharsetUtils.class);
+ private CharsetUtils() {
+ throw new AssertionError("No us.codecraft.webmagic.utils.CharsetUtils instances for you!");
+ }
+
public static String detectCharset(String contentType, byte[] contentBytes) throws IOException {
String charset;
// charset
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/NumberUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/NumberUtils.java
index 55e185105..fbeb8ed3b 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/NumberUtils.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/NumberUtils.java
@@ -6,12 +6,6 @@
public abstract class NumberUtils {
public static int compareLong(long o1, long o2) {
- if (o1 < o2) {
- return -1;
- } else if (o1 == o2) {
- return 0;
- } else {
- return 1;
- }
+ return Long.compare(o1, o2);
}
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java
index c61483a39..ea317c405 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java
@@ -116,6 +116,10 @@ public static List convertToUrls(Collection requests) {
private static final Pattern patternForCharset = Pattern.compile("charset\\s*=\\s*['\"]*([^\\s;'\"]*)", Pattern.CASE_INSENSITIVE);
public static String getCharset(String contentType) {
+ if (contentType == null) {
+ return null;
+ }
+
Matcher matcher = patternForCharset.matcher(contentType);
if (matcher.find()) {
String charset = matcher.group(1);
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/WMCollections.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/WMCollections.java
index 23e1644ce..a2ca5afd0 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/WMCollections.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/WMCollections.java
@@ -21,10 +21,10 @@ public static Set newHashSet(T... t){
}
public static List newArrayList(T... t){
- List set = new ArrayList(t.length);
+ List list = new ArrayList(t.length);
for (T t1 : t) {
- set.add(t1);
+ list.add(t1);
}
- return set;
+ return list;
}
}
diff --git a/webmagic-core/src/main/resources/log4j.xml b/webmagic-core/src/main/resources/log4j.xml
deleted file mode 100644
index c2b5a2f53..000000000
--- a/webmagic-core/src/main/resources/log4j.xml
+++ /dev/null
@@ -1,21 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/RequestTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/RequestTest.java
index c7e4943d9..b8f699a6f 100644
--- a/webmagic-core/src/test/java/us/codecraft/webmagic/RequestTest.java
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/RequestTest.java
@@ -1,9 +1,13 @@
package us.codecraft.webmagic;
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.util.Collections;
+import java.util.Map;
+
import org.junit.Test;
-import us.codecraft.webmagic.utils.HttpConstant;
-import static org.assertj.core.api.Assertions.assertThat;
+import us.codecraft.webmagic.utils.HttpConstant;
/**
* @author code4crafter@gmail.com
@@ -22,4 +26,28 @@ public void testEqualsAndHashCode() throws Exception {
assertThat(requestA).isNotEqualTo(requestB);
assertThat(requestA.hashCode()).isNotEqualTo(requestB.hashCode());
}
+
+ @Test
+ public void testSetExtras() {
+ Request request = new Request();
+ Map extras = Collections.singletonMap("a", "1");
+ request.setExtras(extras);
+ request.putExtra("b", "2");
+ assertThat(request.getExtra("a")).isEqualTo("1");
+ assertThat(request.getExtra("b")).isEqualTo("2");
+ }
+
+ @Test
+ public void testGetExtras() {
+ Request request = new Request();
+ request.putExtra("a", "1");
+ assertThat(request.getExtras()).containsEntry("a", "1");
+ }
+
+ @Test(expected = UnsupportedOperationException.class)
+ public void testGetExtrasShouldBeUnmodifiable() {
+ Request request = new Request();
+ request.getExtras().put("a", "1");
+ }
+
}
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/SiteTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/SiteTest.java
new file mode 100644
index 000000000..47c4fcc14
--- /dev/null
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/SiteTest.java
@@ -0,0 +1,40 @@
+package us.codecraft.webmagic;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+import org.junit.Test;
+
+/** Unit tests for the default-charset and per-domain cookie accessors of {@link Site}. */
+public class SiteTest {
+    @Test
+    public void test() {
+        Site site = Site.me().setDefaultCharset(StandardCharsets.UTF_8.name());
+        assertEquals(StandardCharsets.UTF_8.name(), site.getDefaultCharset());
+    }
+
+    @Test
+    public void addCookieTest() {
+        Site site = Site.me().setDefaultCharset(StandardCharsets.UTF_8.name());
+        // Two-argument overload: no domain is given, so the size check below expects no extra domain key.
+        site.addCookie("cookieDefault", "cookie-webmagicDefault");
+        String firstDomain = "example.com";
+        String secondDomain = "exampleCopy.com";
+        site.addCookie(firstDomain, "cookie", "cookie-webmagic");
+        site.addCookie(firstDomain, "cookieCopy", "cookie-webmagicCopy");
+        site.addCookie(secondDomain, "cookie", "cookie-webmagic");
+        Map<String, Map<String, String>> allCookies = site.getAllCookies();
+        // Snapshot the domain keys with the copy constructor instead of a manual loop.
+        List<String> domains = new ArrayList<>(allCookies.keySet());
+        assertEquals("cookie-webmagic", allCookies.get(firstDomain).get("cookie"));
+        assertEquals("cookie-webmagicCopy", allCookies.get(firstDomain).get("cookieCopy"));
+        assertEquals("cookie-webmagic", allCookies.get(secondDomain).get("cookie"));
+        assertEquals(2, domains.size());
+    }
+
+}
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java
index ece060003..1ff7b4dd7 100644
--- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java
@@ -1,9 +1,10 @@
package us.codecraft.webmagic.downloader;
-import com.github.dreamhead.moco.HttpServer;
-import com.github.dreamhead.moco.Runnable;
-import com.github.dreamhead.moco.Runner;
-import org.apache.commons.collections.map.HashedMap;
+
+import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+import java.util.Map;
+import org.apache.commons.collections4.map.HashedMap;
import org.apache.commons.io.IOUtils;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpUriRequest;
@@ -11,6 +12,9 @@
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.junit.Test;
+import com.github.dreamhead.moco.HttpServer;
+import com.github.dreamhead.moco.Runnable;
+import com.github.dreamhead.moco.Runner;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
@@ -21,14 +25,22 @@
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.utils.CharsetUtils;
import us.codecraft.webmagic.utils.HttpConstant;
-
-import java.io.IOException;
-import java.io.UnsupportedEncodingException;
-import java.util.Map;
-
-import static com.github.dreamhead.moco.Moco.*;
+import static com.github.dreamhead.moco.Moco.and;
+import static com.github.dreamhead.moco.Moco.by;
+import static com.github.dreamhead.moco.Moco.cookie;
+import static com.github.dreamhead.moco.Moco.eq;
+import static com.github.dreamhead.moco.Moco.form;
+import static com.github.dreamhead.moco.Moco.header;
+import static com.github.dreamhead.moco.Moco.httpServer;
+import static com.github.dreamhead.moco.Moco.method;
+import static com.github.dreamhead.moco.Moco.not;
+import static com.github.dreamhead.moco.Moco.query;
+import static com.github.dreamhead.moco.Moco.text;
+import static com.github.dreamhead.moco.Moco.uri;
+import static com.github.dreamhead.moco.Moco.with;
import static org.assertj.core.api.Assertions.assertThat;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertThrows;
import static org.junit.Assert.assertTrue;
/**
@@ -322,5 +334,13 @@ public void run() throws Exception {
});
}
+ @Test
+ public void test_no_task_download(){
+ Request request = new Request();
+ request.setUrl("http://127.0.0.1:13423/");
+ HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
+ assertThrows(NullPointerException.class, () -> httpClientDownloader.download(request,null));
+ }
+
}
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java
index 3aa742c10..58dd3a6fa 100644
--- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java
@@ -1,13 +1,15 @@
package us.codecraft.webmagic.downloader;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
import org.apache.commons.io.IOUtils;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.selector.PlainText;
-import java.io.IOException;
-import java.io.InputStream;
/**
* @author code4crafter@gmail.com
@@ -19,7 +21,7 @@ public Page download(Request request, Task task) {
Page page = new Page();
InputStream resourceAsStream = this.getClass().getResourceAsStream("/html/mock-github.html");
try {
- page.setRawText(IOUtils.toString(resourceAsStream));
+ page.setRawText(IOUtils.toString(resourceAsStream, Charset.defaultCharset()));
} catch (IOException e) {
e.printStackTrace();
}
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/processor/PageProcessorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/processor/PageProcessorTest.java
new file mode 100644
index 000000000..ebb1225cc
--- /dev/null
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/processor/PageProcessorTest.java
@@ -0,0 +1,40 @@
+package us.codecraft.webmagic.processor;
+
+import static org.junit.Assert.assertEquals;
+
+import org.junit.Test;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+
+/** Tests {@link PageProcessor#getSite()} for a processor with and without an override. */
+public class PageProcessorTest {
+
+    /** A processor that does not override getSite() is expected to equal Site.me(). */
+    @Test
+    public void testGetSite() {
+        PageProcessor processor = new PageProcessor() {
+            @Override
+            public void process(Page page) {
+            }
+        };
+        assertEquals(Site.me(), processor.getSite());
+    }
+
+    /** An explicit getSite() override is expected to be returned as configured. */
+    @Test
+    public void testGetSiteOverridden() {
+        PageProcessor processor = new PageProcessor() {
+            @Override
+            public void process(Page page) {
+            }
+
+            @Override
+            public Site getSite() {
+                return Site.me().setTimeOut(123);
+            }
+        };
+        assertEquals(Site.me().setTimeOut(123), processor.getSite());
+    }
+
+}
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java
index 8e4c82026..61fc6ab8b 100644
--- a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java
@@ -8,19 +8,19 @@
import java.util.List;
import org.apache.http.HttpHost;
-import org.junit.BeforeClass;
-import org.junit.Test;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
/**
* @author yxssfxwzy@sina.com May 30, 2014
- *
+ *
*/
-public class ProxyTest {
+class ProxyTest {
private static List httpProxyList = new ArrayList();
- @BeforeClass
- public static void before() {
+ @BeforeAll
+ static void before() {
// String[] source = { "0.0.0.1:0", "0.0.0.2:0", "0.0.0.3:0",
// "0.0.0.4:0" };
String[] source = { "::0.0.0.1:0", "::0.0.0.2:0", "::0.0.0.3:0", "::0.0.0.4:0" };
@@ -48,7 +48,7 @@ public void run() {
}
@Test
- public void testCreate() {
+ void testCreate() {
Proxy proxy = Proxy.create(URI.create("//127.0.0.1:8080"));
assertNull(proxy.getScheme());
assertNull(proxy.getUsername());
@@ -86,7 +86,15 @@ public void testCreate() {
}
@Test
- public void testToString() {
+ void testEqualsHashCode() {
+ var proxy0 = new Proxy("::1", 1080);
+ var proxy1 = new Proxy("::1", 1080);
+ assertEquals(proxy0, proxy1);
+ assertEquals(proxy0.hashCode(), proxy1.hashCode());
+ }
+
+ @Test
+ void testToString() {
assertEquals("//127.0.0.1:8080", new Proxy("127.0.0.1", 8080).toString());
assertEquals("http://127.0.0.1:8080", new Proxy("127.0.0.1", 8080, "http").toString());
assertEquals("//username:password@127.0.0.1:8080", new Proxy("127.0.0.1", 8080, "username", "password").toString());
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/SimpleProxyProviderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/SimpleProxyProviderTest.java
index 6495b16bf..e9325a7a7 100644
--- a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/SimpleProxyProviderTest.java
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/SimpleProxyProviderTest.java
@@ -1,6 +1,9 @@
package us.codecraft.webmagic.proxy;
import org.junit.Test;
+import org.mockito.Mockito;
+
+import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
@@ -20,11 +23,12 @@ public void test_get_proxy() throws Exception {
Proxy originProxy1 = new Proxy("127.0.0.1", 1087);
Proxy originProxy2 = new Proxy("127.0.0.1", 1088);
SimpleProxyProvider proxyProvider = SimpleProxyProvider.from(originProxy1, originProxy2);
- Proxy proxy = proxyProvider.getProxy(TASK);
+ Request request = Mockito.mock(Request.class);
+ Proxy proxy = proxyProvider.getProxy(request, TASK);
assertThat(proxy).isEqualTo(originProxy1);
- proxy = proxyProvider.getProxy(TASK);
+ proxy = proxyProvider.getProxy(request, TASK);
assertThat(proxy).isEqualTo(originProxy2);
- proxy = proxyProvider.getProxy(TASK);
+ proxy = proxyProvider.getProxy(request, TASK);
assertThat(proxy).isEqualTo(originProxy1);
}
}
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/AndSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/AndSelectorTest.java
new file mode 100644
index 000000000..59885ebd1
--- /dev/null
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/AndSelectorTest.java
@@ -0,0 +1,59 @@
+package us.codecraft.webmagic.selector;
+
+import static org.junit.Assert.assertEquals;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.Test;
+
+public class AndSelectorTest {
+
+ @Test
+ public void testSelectList() {
+ String htmlContent = "\n" +
+ "\n" +
+ "\n" +
+ " \n" +
+ " \n" +
+ " HTML with XPath \n" +
+ "\n" +
+ "\n" +
+ " \n" +
+ "
Item 1
\n" +
+ "
Item 2
\n" +
+ "
\n" +
+ "\n" +
+ "";
+ List selectors = new ArrayList();
+ selectors.add(new CssSelector("div"));
+ selectors.add(new XpathSelector("//div[@class='item1']"));
+ AndSelector andSelector = new AndSelector(selectors);
+ List result = andSelector.selectList(htmlContent);
+ assertEquals("\n Item 1\n
", result.get(0));
+ }
+
+ @Test
+ public void testSelectList_NoResults() {
+ String htmlContent = "\n" +
+ "\n" +
+ "\n" +
+ " \n" +
+ " \n" +
+ " HTML with XPath \n" +
+ "\n" +
+ "\n" +
+ " \n" +
+ "
Item 1
\n" +
+ "
Item 2
\n" +
+ "
\n" +
+ "\n" +
+ "";
+ List selectors = new ArrayList();
+ selectors.add(new CssSelector("div"));
+ selectors.add(new XpathSelector("//div[@class='item']"));
+ AndSelector andSelector = new AndSelector(selectors);
+ List result = andSelector.selectList(htmlContent);
+ assertEquals(0, result.size());
+ }
+}
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/CssSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/CssSelectorTest.java
new file mode 100644
index 000000000..8b1ace903
--- /dev/null
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/CssSelectorTest.java
@@ -0,0 +1,39 @@
+package us.codecraft.webmagic.selector;
+
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.mockito.Mock;
+import org.mockito.Mockito;
+import org.mockito.runners.MockitoJUnitRunner;
+
+import java.util.List;
+import static org.junit.Assert.*;
+
+public class CssSelectorTest {
+
+ @Test
+ public void testSelectElement() {
+ CssSelector cssSelector = new CssSelector("div");
+ String htmlContent = "Dummy Page Hello World!
";
+ Document doc = Jsoup.parse(htmlContent);
+ Element dummyElement = doc.getElementById("dummyDiv");
+ Element resultElement = cssSelector.selectElement(dummyElement);
+ assertNotNull(resultElement);
+ }
+
+ @Test
+ public void testSelectList() {
+ CssSelector cssSelector = new CssSelector("div");
+ String htmlContent = "Dummy Page Hello World!
";
+ Document doc = Jsoup.parse(htmlContent);
+ Element dummyElement = doc.getElementById("dummyDiv");
+ List result = cssSelector.selectList(dummyElement);
+ assertEquals(1, result.size());
+ assertEquals("[\n Hello World!\n
]", result.toString());
+ }
+
+}
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/OrSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/OrSelectorTest.java
new file mode 100644
index 000000000..24d87647c
--- /dev/null
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/OrSelectorTest.java
@@ -0,0 +1,44 @@
+package us.codecraft.webmagic.selector;
+
+import static org.junit.Assert.assertEquals;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.Test;
+
+public class OrSelectorTest {
+ @Test
+ public void testSelectList() {
+ String htmlContent = "\n" +
+ "\n" +
+ "\n" +
+ " \n" +
+ " \n" +
+ " HTML with XPath \n" +
+ "\n" +
+ "\n" +
+ " \n" +
+ "
Item 1
\n" +
+ "
Item 2
\n" +
+ "
\n" +
+ "\n" +
+ "";
+ String expectedResult = "[\n" +
+ " \n" +
+ " \n" +
+ " HTML with XPath \n" +
+ ", \n" +
+ " Item 1\n" +
+ "
, \n" +
+ " Item 2\n" +
+ "
]";
+ List selectors = new ArrayList();
+ selectors.add(new CssSelector("head"));
+ selectors.add(new XpathSelector("//div[@class='item1']"));
+ selectors.add(new XpathSelector("//div[@class='item2']"));
+ OrSelector orSelector = new OrSelector(selectors);
+ List result = orSelector.selectList(htmlContent);
+ assertEquals(expectedResult, result.toString());
+ }
+}
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/CharsetUtilsTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/CharsetUtilsTest.java
new file mode 100644
index 000000000..987a6f77a
--- /dev/null
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/CharsetUtilsTest.java
@@ -0,0 +1,16 @@
+package us.codecraft.webmagic.utils;
+
+import static org.junit.jupiter.api.Assertions.assertNull;
+
+import java.io.IOException;
+
+import org.junit.jupiter.api.Test;
+
+class CharsetUtilsTest {
+
+ @Test
+ void testDetectCharset() throws IOException {
+ assertNull(CharsetUtils.detectCharset(null, new byte[0]));
+ }
+
+}
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java
index 6afdeefe4..38c8295bb 100644
--- a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java
@@ -1,5 +1,7 @@
package us.codecraft.webmagic.utils;
+import static org.junit.Assert.assertNull;
+
import org.junit.Assert;
import org.junit.Test;
@@ -43,5 +45,9 @@ public void testGetDomain(){
Assert.assertEquals("www.dianping.com",UrlUtils.getDomain(url));
}
+ @Test
+ public void testGetCharset() {
+ assertNull(UrlUtils.getCharset(null));
+ }
}
diff --git a/webmagic-core/src/test/resources/log4j.xml b/webmagic-core/src/test/resources/log4j.xml
deleted file mode 100644
index c2b5a2f53..000000000
--- a/webmagic-core/src/test/resources/log4j.xml
+++ /dev/null
@@ -1,21 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/webmagic-core/src/test/resources/log4j2-test.xml b/webmagic-core/src/test/resources/log4j2-test.xml
new file mode 100644
index 000000000..86aee5f59
--- /dev/null
+++ b/webmagic-core/src/test/resources/log4j2-test.xml
@@ -0,0 +1,16 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml
new file mode 100644
index 000000000..2b4a53460
--- /dev/null
+++ b/webmagic-coverage/pom.xml
@@ -0,0 +1,74 @@
+
+
+ 4.0.0
+
+
+ us.codecraft
+ webmagic
+ 1.0.3
+
+
+ webmagic-coverage
+ pom
+ webmagic-coverage
+ Compute aggregated test code coverage
+
+
+ true
+
+
+
+
+ ${project.groupId}
+ webmagic-core
+ ${project.version}
+
+
+ ${project.groupId}
+ webmagic-extension
+ ${project.version}
+
+
+ ${project.groupId}
+ webmagic-scripts
+ ${project.version}
+
+
+ ${project.groupId}
+ webmagic-selenium
+ ${project.version}
+
+
+ ${project.groupId}
+ webmagic-saxon
+ ${project.version}
+
+
+ ${project.groupId}
+ webmagic-samples
+ ${project.version}
+
+
+
+
+
+
+ org.jacoco
+ jacoco-maven-plugin
+
+
+
+ report-aggregate
+
+
+
+
+
+
+
+
diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml
index bf7ff05d6..93faa4aaf 100644
--- a/webmagic-extension/pom.xml
+++ b/webmagic-extension/pom.xml
@@ -1,19 +1,35 @@
-
+
us.codecraft
- webmagic-parent
- 0.7.3
+ webmagic
+ 1.0.3
4.0.0
webmagic-extension
+
+ org.projectlombok
+ lombok
+ 1.18.32
+ provided
+
redis.clients
jedis
+
+ org.assertj
+ assertj-core
+ test
+
com.google.guava
guava
@@ -24,10 +40,6 @@
webmagic-core
${project.version}
-
- junit
- junit
-
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java
index 6055bdb0f..01f1af9a3 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java
@@ -6,6 +6,7 @@
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.selector.PlainText;
+import us.codecraft.webmagic.utils.HttpConstant;
import java.io.*;
@@ -16,73 +17,70 @@
* @version 0.5.3
*/
public class PhantomJSDownloader extends AbstractDownloader {
-
- private static Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class);
+ private static final Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class);
private static String crawlJsPath;
private static String phantomJsCommand = "phantomjs"; // default
- private int retryNum;
- private int threadNum;
-
public PhantomJSDownloader() {
this.initPhantomjsCrawlPath();
}
-
+
/**
* 添加新的构造函数,支持phantomjs自定义命令
- *
- * example:
- * phantomjs.exe 支持windows环境
- * phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误
- * /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException
- *
+ *
+ * example:
+ * phantomjs.exe 支持windows环境
+ * phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误
+ * /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException
+ *
* @param phantomJsCommand phantomJsCommand
*/
public PhantomJSDownloader(String phantomJsCommand) {
this.initPhantomjsCrawlPath();
PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
}
-
+
/**
* 新增构造函数,支持crawl.js路径自定义,因为当其他项目依赖此jar包时,runtime.exec()执行phantomjs命令时无使用法jar包中的crawl.js
*
* crawl.js start --
- *
+ *
* var system = require('system');
* var url = system.args[1];
- *
+ *
* var page = require('webpage').create();
* page.settings.loadImages = false;
* page.settings.resourceTimeout = 5000;
- *
+ *
* page.open(url, function (status) {
* if (status != 'success') {
* console.log("HTTP request failed!");
* } else {
* console.log(page.content);
* }
- *
+ *
* page.close();
* phantom.exit();
* });
- *
+ *
* -- crawl.js end
*
* 具体项目时可以将以上js代码复制下来使用
- *
+ *
* example:
- * new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js");
- *
+ * new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js");
+ *
* @param phantomJsCommand phantomJsCommand
- * @param crawlJsPath crawlJsPath
+ * @param crawlJsPath crawlJsPath
*/
public PhantomJSDownloader(String phantomJsCommand, String crawlJsPath) {
- PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
- PhantomJSDownloader.crawlJsPath = crawlJsPath;
+ PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
+ PhantomJSDownloader.crawlJsPath = crawlJsPath;
}
-
+
private void initPhantomjsCrawlPath() {
- PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath() + System.getProperty("file.separator") + "crawl.js ";
+ PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath()
+ + System.getProperty("file.separator") + "crawl.js ";
}
@Override
@@ -90,61 +88,41 @@ public Page download(Request request, Task task) {
if (logger.isInfoEnabled()) {
logger.info("downloading page: " + request.getUrl());
}
- String content = getPage(request);
- if (content.contains("HTTP request failed")) {
- for (int i = 1; i <= getRetryNum(); i++) {
- content = getPage(request);
- if (!content.contains("HTTP request failed")) {
- break;
- }
- }
- if (content.contains("HTTP request failed")) {
- //when failed
- Page page = new Page();
+
+ Page page = Page.fail(request);
+ try {
+ String content = getPage(request);
+ if (!content.contains("HTTP request failed")) {
+ page.setDownloadSuccess(true);
+ page.setRawText(content);
+ page.setUrl(new PlainText(request.getUrl()));
page.setRequest(request);
- return page;
+ page.setStatusCode(HttpConstant.StatusCode.CODE_200);
}
+ onSuccess(page, task);
+ } catch (Exception e) {
+ onError(page, task, e);
+ logger.warn("download page {} error", request.getUrl(), e);
}
-
- Page page = new Page();
- page.setRawText(content);
- page.setUrl(new PlainText(request.getUrl()));
- page.setRequest(request);
- page.setStatusCode(200);
return page;
}
@Override
public void setThread(int threadNum) {
- this.threadNum = threadNum;
+ // ignore
}
-    protected String getPage(Request request) {
-        try {
-            String url = request.getUrl();
-            Runtime runtime = Runtime.getRuntime();
-            Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url);
-            InputStream is = process.getInputStream();
-            BufferedReader br = new BufferedReader(new InputStreamReader(is));
-            StringBuffer stringBuffer = new StringBuffer();
-            String line;
-            while ((line = br.readLine()) != null) {
-                stringBuffer.append(line).append("\n");
-            }
-            return stringBuffer.toString();
-        } catch (IOException e) {
-            e.printStackTrace();
-        }
-
-        return null;
-    }
-
-    public int getRetryNum() {
-        return retryNum;
-    }
-
-    public PhantomJSDownloader setRetryNum(int retryNum) {
-        this.retryNum = retryNum;
-        return this;
-    }
+    /**
+     * Runs the configured phantomjs command with the crawl script and the request URL, and
+     * returns everything the process wrote to standard output, each line terminated by "\n".
+     *
+     * @param request request whose URL is appended as the last command-line argument
+     * @return the captured standard output of the phantomjs process
+     * @throws Exception if the process cannot be started or its output cannot be read
+     */
+    protected String getPage(Request request) throws Exception {
+        String url = request.getUrl();
+        Process process = Runtime.getRuntime().exec(phantomJsCommand + " " + crawlJsPath + " " + url);
+        // try-with-resources closes the reader (and the stdout pipe beneath it) even when
+        // readLine() throws; the stream was previously never closed.
+        try (BufferedReader br = new BufferedReader(new InputStreamReader(process.getInputStream()))) {
+            StringBuilder builder = new StringBuilder();
+            String line;
+            while ((line = br.readLine()) != null) {
+                builder.append(line).append("\n");
+            }
+            return builder.toString();
+        } finally {
+            // Release the remaining process handles once output is consumed or reading failed.
+            process.destroy();
+        }
+    }
}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java
index f1d2f84d4..673447586 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java
@@ -1,5 +1,9 @@
package us.codecraft.webmagic.model;
+import lombok.Getter;
+import lombok.Setter;
+
+import us.codecraft.webmagic.model.sources.Source;
import us.codecraft.webmagic.selector.Selector;
/**
@@ -7,18 +11,18 @@
* @author code4crafter@gmail.com
* @since 0.2.0
*/
-class Extractor {
+public class Extractor {
+ @Getter @Setter
protected Selector selector;
+ @Getter
protected final Source source;
protected final boolean notNull;
protected final boolean multi;
-
- static enum Source {Html, Url, RawHtml, RawText}
-
+
public Extractor(Selector selector, Source source, boolean notNull, boolean multi) {
this.selector = selector;
this.source = source;
@@ -26,23 +30,11 @@ public Extractor(Selector selector, Source source, boolean notNull, boolean mult
this.multi = multi;
}
- Selector getSelector() {
- return selector;
- }
-
- Source getSource() {
- return source;
- }
-
- boolean isNotNull() {
+ public boolean isNotNull() {
return notNull;
}
- boolean isMulti() {
+ public boolean isMulti() {
return multi;
}
-
- void setSelector(Selector selector) {
- this.selector = selector;
- }
}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java
index a2cba1332..d4cb5937f 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java
@@ -1,58 +1,33 @@
package us.codecraft.webmagic.model;
import us.codecraft.webmagic.model.formatter.ObjectFormatter;
+import us.codecraft.webmagic.model.sources.Source;
import us.codecraft.webmagic.selector.Selector;
import java.lang.reflect.Field;
import java.lang.reflect.Method;
+import lombok.Getter;
+import lombok.Setter;
+
/**
* Wrapper of field and extractor.
* @author code4crafter@gmail.com
* @since 0.2.0
*/
-class FieldExtractor extends Extractor {
+public class FieldExtractor extends Extractor {
+ @Getter
private final Field field;
+ @Getter @Setter
private Method setterMethod;
+ @Getter @Setter
private ObjectFormatter objectFormatter;
public FieldExtractor(Field field, Selector selector, Source source, boolean notNull, boolean multi) {
super(selector, source, notNull, multi);
this.field = field;
}
-
- Field getField() {
- return field;
- }
-
- Selector getSelector() {
- return selector;
- }
-
- Source getSource() {
- return source;
- }
-
- void setSetterMethod(Method setterMethod) {
- this.setterMethod = setterMethod;
- }
-
- Method getSetterMethod() {
- return setterMethod;
- }
-
- boolean isNotNull() {
- return notNull;
- }
-
- ObjectFormatter getObjectFormatter() {
- return objectFormatter;
- }
-
- void setObjectFormatter(ObjectFormatter objectFormatter) {
- this.objectFormatter = objectFormatter;
- }
}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java
index 1e25a46c0..751aafe76 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java
@@ -3,17 +3,21 @@
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+
+import lombok.Getter;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.model.annotation.*;
-import us.codecraft.webmagic.model.formatter.ObjectFormatter;
+import us.codecraft.webmagic.model.fields.PageField;
import us.codecraft.webmagic.model.formatter.ObjectFormatterBuilder;
+import us.codecraft.webmagic.model.sources.Source;
+import us.codecraft.webmagic.model.sources.SourceTextExtractor;
+import us.codecraft.webmagic.model.sources.Source.*;
import us.codecraft.webmagic.selector.*;
import us.codecraft.webmagic.utils.ClassUtils;
import us.codecraft.webmagic.utils.ExtractorUtils;
import java.lang.annotation.Annotation;
import java.lang.reflect.Field;
-import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.List;
@@ -29,14 +33,19 @@
*/
class PageModelExtractor {
+ @Getter
private List targetUrlPatterns = new ArrayList();
+ @Getter
private Selector targetUrlRegionSelector;
+ @Getter
private List helpUrlPatterns = new ArrayList();
+ @Getter
private Selector helpUrlRegionSelector;
+ @Getter
private Class clazz;
private List fieldExtractors;
@@ -86,7 +95,7 @@ private FieldExtractor getAnnotationExtractByUrl(Class clazz, Field field) {
regexPattern = ".*";
}
fieldExtractor = new FieldExtractor(field,
- new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(),
+ new RegexSelector(regexPattern), new Url(), extractByUrl.notNull(),
extractByUrl.multi() || List.class.isAssignableFrom(field.getType()));
Method setterMethod = getSetterMethod(clazz, field);
if (setterMethod != null) {
@@ -112,7 +121,7 @@ private FieldExtractor getAnnotationExtractCombo(Class clazz, Field field) {
default:
selector = new AndSelector(ExtractorUtils.getSelectors(extractBies));
}
- fieldExtractor = new FieldExtractor(field, selector, comboExtract.source() == ComboExtract.Source.RawHtml ? FieldExtractor.Source.RawHtml : FieldExtractor.Source.Html,
+ fieldExtractor = new FieldExtractor(field, selector, comboExtract.source() == ComboExtract.Source.RawHtml ? new RawHtml() : new SelectedHtml(),
comboExtract.notNull(), comboExtract.multi() || List.class.isAssignableFrom(field.getType()));
Method setterMethod = getSetterMethod(clazz, field);
if (setterMethod != null) {
@@ -127,26 +136,23 @@ private FieldExtractor getAnnotationExtractBy(Class clazz, Field field) {
ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
if (extractBy != null) {
Selector selector = ExtractorUtils.getSelector(extractBy);
- ExtractBy.Source source0 = extractBy.source();
- if (extractBy.type()== ExtractBy.Type.JsonPath){
- source0 = RawText;
- }
- FieldExtractor.Source source = null;
- switch (source0){
+ ExtractBy.Source extractSource = extractBy.source();
+ if (extractBy.type()== ExtractBy.Type.JsonPath)
+ extractSource = RawText;
+ Source source = null;
+ switch (extractSource) {
case RawText:
- source = FieldExtractor.Source.RawText;
+ source = new RawText();
break;
case RawHtml:
- source = FieldExtractor.Source.RawHtml;
+ source = new RawHtml();
break;
case SelectedHtml:
- source =FieldExtractor.Source.Html;
+ source = new SelectedHtml();
break;
default:
- source =FieldExtractor.Source.Html;
-
+ source = new SelectedHtml();
}
-
fieldExtractor = new FieldExtractor(field, selector, source,
extractBy.notNull(), List.class.isAssignableFrom(field.getType()));
fieldExtractor.setSetterMethod(getSetterMethod(clazz, field));
@@ -193,7 +199,7 @@ private void initClassExtractors() {
annotation = clazz.getAnnotation(ExtractBy.class);
if (annotation != null) {
ExtractBy extractBy = (ExtractBy) annotation;
- objectExtractor = new Extractor(new XpathSelector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi());
+ objectExtractor = new Extractor(new XpathSelector(extractBy.value()), new SelectedHtml(), extractBy.notNull(), extractBy.multi());
}
}
@@ -233,135 +239,15 @@ private Object processSingle(Page page, String html, boolean isRaw) {
try {
o = clazz.newInstance();
for (FieldExtractor fieldExtractor : fieldExtractors) {
- if (fieldExtractor.isMulti()) {
- List value;
- switch (fieldExtractor.getSource()) {
- case RawHtml:
- value = page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
- break;
- case Html:
- if (isRaw) {
- value = page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
- } else {
- value = fieldExtractor.getSelector().selectList(html);
- }
- break;
- case Url:
- value = fieldExtractor.getSelector().selectList(page.getUrl().toString());
- break;
- case RawText:
- value = fieldExtractor.getSelector().selectList(page.getRawText());
- break;
- default:
- value = fieldExtractor.getSelector().selectList(html);
- }
- if ((value == null || value.size() == 0) && fieldExtractor.isNotNull()) {
- return null;
- }
- if (fieldExtractor.getObjectFormatter() != null) {
- List converted = convert(value, fieldExtractor.getObjectFormatter());
- setField(o, fieldExtractor, converted);
- } else {
- setField(o, fieldExtractor, value);
- }
- } else {
- String value;
- switch (fieldExtractor.getSource()) {
- case RawHtml:
- value = page.getHtml().selectDocument(fieldExtractor.getSelector());
- break;
- case Html:
- if (isRaw) {
- value = page.getHtml().selectDocument(fieldExtractor.getSelector());
- } else {
- value = fieldExtractor.getSelector().select(html);
- }
- break;
- case Url:
- value = fieldExtractor.getSelector().select(page.getUrl().toString());
- break;
- case RawText:
- value = fieldExtractor.getSelector().select(page.getRawText());
- break;
- default:
- value = fieldExtractor.getSelector().select(html);
- }
- if (value == null && fieldExtractor.isNotNull()) {
- return null;
- }
- if (fieldExtractor.getObjectFormatter() != null) {
- Object converted = convert(value, fieldExtractor.getObjectFormatter());
- if (converted == null && fieldExtractor.isNotNull()) {
- return null;
- }
- setField(o, fieldExtractor, converted);
- } else {
- setField(o, fieldExtractor, value);
- }
- }
+ PageField field = SourceTextExtractor.getText(page, html, isRaw, fieldExtractor);
+ if (!field.operation(o, fieldExtractor, logger))
+ return null;
}
- if (AfterExtractor.class.isAssignableFrom(clazz)) {
+ if (AfterExtractor.class.isAssignableFrom(clazz))
((AfterExtractor) o).afterProcess(page);
- }
- } catch (InstantiationException e) {
- logger.error("extract fail", e);
- } catch (IllegalAccessException e) {
- logger.error("extract fail", e);
- } catch (InvocationTargetException e) {
+ } catch (Exception e) {
logger.error("extract fail", e);
}
return o;
}
-
- private Object convert(String value, ObjectFormatter objectFormatter) {
- try {
- Object format = objectFormatter.format(value);
- logger.debug("String {} is converted to {}", value, format);
- return format;
- } catch (Exception e) {
- logger.error("convert " + value + " to " + objectFormatter.clazz() + " error!", e);
- }
- return null;
- }
-
- private List convert(List values, ObjectFormatter objectFormatter) {
- List objects = new ArrayList();
- for (String value : values) {
- Object converted = convert(value, objectFormatter);
- if (converted != null) {
- objects.add(converted);
- }
- }
- return objects;
- }
-
- private void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException {
- if (value == null) {
- return;
- }
- if (fieldExtractor.getSetterMethod() != null) {
- fieldExtractor.getSetterMethod().invoke(o, value);
- }
- fieldExtractor.getField().set(o, value);
- }
-
- Class getClazz() {
- return clazz;
- }
-
- List getTargetUrlPatterns() {
- return targetUrlPatterns;
- }
-
- List getHelpUrlPatterns() {
- return helpUrlPatterns;
- }
-
- Selector getTargetUrlRegionSelector() {
- return targetUrlRegionSelector;
- }
-
- Selector getHelpUrlRegionSelector() {
- return helpUrlRegionSelector;
- }
}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/MultipleField.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/MultipleField.java
new file mode 100644
index 000000000..4a4bf38a8
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/MultipleField.java
@@ -0,0 +1,55 @@
+package us.codecraft.webmagic.model.fields;
+
+import java.lang.reflect.InvocationTargetException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.slf4j.Logger;
+
+import lombok.Getter;
+import us.codecraft.webmagic.model.FieldExtractor;
+import us.codecraft.webmagic.model.formatter.ObjectFormatter;
+
+/**
+ * Holds the list of raw strings extracted for a multi-valued field and writes it to the target object.
+ */
+public class MultipleField extends PageField {
+    /** Raw extracted values; may be {@code null} or empty. */
+    @Getter
+    private List<String> fieldNames;
+
+    public MultipleField(List<String> fieldNames) {
+        this.fieldNames = fieldNames;
+    }
+
+    /**
+     * Converts the extracted values (when the extractor has a formatter) and assigns them to {@code o}.
+     *
+     * @return {@code false} when the extractor is marked not-null and no values were extracted;
+     *         {@code true} otherwise
+     */
+    @Override
+    public boolean operation(Object o, FieldExtractor fieldExtractor, Logger logger) throws IllegalAccessException, InvocationTargetException {
+        boolean empty = this.fieldNames == null || this.fieldNames.isEmpty();
+        if (empty && fieldExtractor.isNotNull())
+            return false;
+        if (this.fieldNames != null && fieldExtractor.getObjectFormatter() != null) {
+            List<Object> converted = this.convert(this.fieldNames, fieldExtractor.getObjectFormatter(), logger);
+            setField(o, fieldExtractor, converted);
+        }
+        else
+            setField(o, fieldExtractor, this.fieldNames);
+        return true;
+    }
+
+    /** Formats each value, leaving out those for which the single-value convert returned {@code null}. */
+    private List<Object> convert(List<String> values, ObjectFormatter objectFormatter, Logger logger) {
+        List<Object> objects = new ArrayList<>();
+        for (String value : values) {
+            Object converted = this.convert(value, objectFormatter, logger);
+            if (converted != null)
+                objects.add(converted);
+        }
+        return objects;
+    }
+}
\ No newline at end of file
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/PageField.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/PageField.java
new file mode 100644
index 000000000..ad4428335
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/PageField.java
@@ -0,0 +1,31 @@
+package us.codecraft.webmagic.model.fields;
+
+import java.lang.reflect.InvocationTargetException;
+
+import org.slf4j.Logger;
+
+import us.codecraft.webmagic.model.FieldExtractor;
+import us.codecraft.webmagic.model.formatter.ObjectFormatter;
+
+public abstract class PageField {
+ public abstract boolean operation(Object o, FieldExtractor fieldExtractor, Logger logger) throws IllegalAccessException, InvocationTargetException;
+
+ protected Object convert(String value, ObjectFormatter objectFormatter, Logger logger) {
+ try {
+ Object format = objectFormatter.format(value);
+ logger.debug("String {} is converted to {}", value, format);
+ return format;
+ } catch (Exception e) {
+ logger.error("convert " + value + " to " + objectFormatter.clazz() + " error!", e);
+ }
+ return null;
+ }
+
+ protected void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException {
+ if (value != null) {
+ if (fieldExtractor.getSetterMethod() != null)
+ fieldExtractor.getSetterMethod().invoke(o, value);
+ fieldExtractor.getField().set(o, value);
+ }
+ }
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/SingleField.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/SingleField.java
new file mode 100644
index 000000000..136a1c56e
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/fields/SingleField.java
@@ -0,0 +1,42 @@
+package us.codecraft.webmagic.model.fields;
+
+import java.lang.reflect.InvocationTargetException;
+
+import org.slf4j.Logger;
+
+import lombok.Getter;
+import us.codecraft.webmagic.model.FieldExtractor;
+
+/**
+ * Holds the single raw string extracted for a field and writes it to the target object.
+ */
+public class SingleField extends PageField {
+    /** Raw extracted value; may be {@code null}. */
+    @Getter
+    private String fieldName;
+
+    public SingleField(String fieldName) {
+        this.fieldName = fieldName;
+    }
+
+    /**
+     * Converts the extracted value (when the extractor has a formatter) and assigns it to {@code o}.
+     *
+     * @return {@code false} when the extractor is marked not-null and the raw or converted value is
+     *         {@code null}; {@code true} otherwise
+     */
+    @Override
+    public boolean operation(Object o, FieldExtractor fieldExtractor, Logger logger) throws IllegalAccessException, InvocationTargetException {
+        // A missing raw value must fail a not-null field even when no formatter is configured.
+        if (this.fieldName == null && fieldExtractor.isNotNull())
+            return false;
+        if (fieldExtractor.getObjectFormatter() != null) {
+            Object converted = this.convert(this.fieldName, fieldExtractor.getObjectFormatter(), logger);
+            if (converted == null && fieldExtractor.isNotNull())
+                return false;
+            setField(o, fieldExtractor, converted);
+        } else
+            setField(o, fieldExtractor, this.fieldName);
+        return true;
+    }
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicClassDetector.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicClassDetector.java
new file mode 100644
index 000000000..f03b8864a
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicClassDetector.java
@@ -0,0 +1,100 @@
+package us.codecraft.webmagic.model.formatter;
+
+/**
+ * Maps a primitive or boxed type to its wrapper class.
+ */
+public interface BasicClassDetector {
+    /**
+     * @param type the class to inspect
+     * @return the wrapper class when this detector handles {@code type}, otherwise {@code null}
+     */
+    Class<?> detectBasicClass(Class<?> type);
+}
+
+/** Recognises {@code int} and {@link Integer}. */
+class IntegerClassDetector implements BasicClassDetector {
+    @Override
+    public Class<?> detectBasicClass(Class<?> type) {
+        if (type.equals(Integer.TYPE) || type.equals(Integer.class)) {
+            return Integer.class;
+        }
+        return null;
+    }
+}
+
+/** Recognises {@code long} and {@link Long}. */
+class LongClassDetector implements BasicClassDetector {
+    @Override
+    public Class<?> detectBasicClass(Class<?> type) {
+        if (type.equals(Long.TYPE) || type.equals(Long.class)) {
+            return Long.class;
+        }
+        return null;
+    }
+}
+
+/** Recognises {@code double} and {@link Double}. */
+class DoubleClassDetector implements BasicClassDetector {
+    @Override
+    public Class<?> detectBasicClass(Class<?> type) {
+        if (type.equals(Double.TYPE) || type.equals(Double.class)) {
+            return Double.class;
+        }
+        return null;
+    }
+}
+
+/** Recognises {@code float} and {@link Float}. */
+class FloatClassDetector implements BasicClassDetector {
+    @Override
+    public Class<?> detectBasicClass(Class<?> type) {
+        if (type.equals(Float.TYPE) || type.equals(Float.class)) {
+            return Float.class;
+        }
+        return null;
+    }
+}
+
+/** Recognises {@code short} and {@link Short}. */
+class ShortClassDetector implements BasicClassDetector {
+    @Override
+    public Class<?> detectBasicClass(Class<?> type) {
+        if (type.equals(Short.TYPE) || type.equals(Short.class)) {
+            return Short.class;
+        }
+        return null;
+    }
+}
+
+/** Recognises {@code char} and {@link Character}. */
+class CharacterClassDetector implements BasicClassDetector {
+    @Override
+    public Class<?> detectBasicClass(Class<?> type) {
+        if (type.equals(Character.TYPE) || type.equals(Character.class)) {
+            return Character.class;
+        }
+        return null;
+    }
+}
+
+/** Recognises {@code byte} and {@link Byte}. */
+class ByteClassDetector implements BasicClassDetector {
+    @Override
+    public Class<?> detectBasicClass(Class<?> type) {
+        if (type.equals(Byte.TYPE) || type.equals(Byte.class)) {
+            return Byte.class;
+        }
+        return null;
+    }
+}
+
+/** Recognises {@code boolean} and {@link Boolean}. */
+class BooleanClassDetector implements BasicClassDetector {
+    @Override
+    public Class<?> detectBasicClass(Class<?> type) {
+        if (type.equals(Boolean.TYPE) || type.equals(Boolean.class)) {
+            return Boolean.class;
+        }
+        return null;
+    }
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java
index f9d76a845..2d4d85b0a 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java
@@ -24,28 +24,24 @@ public T format(String raw) throws Exception {
}
protected abstract T formatTrimmed(String raw) throws Exception;
-
public static final List> basicTypeFormatters = Arrays.>asList(IntegerFormatter.class,
LongFormatter.class, DoubleFormatter.class, FloatFormatter.class, ShortFormatter.class,
CharactorFormatter.class, ByteFormatter.class, BooleanFormatter.class);
+ public static final List basicClassDetector= Arrays.asList(new IntegerClassDetector(),
+ new LongClassDetector(),
+ new FloatClassDetector(),
+ new DoubleClassDetector(),
+ new ShortClassDetector(),
+ new ByteClassDetector(),
+ new BooleanClassDetector(),
+ new CharacterClassDetector());
public static Class> detectBasicClass(Class> type) {
- if (type.equals(Integer.TYPE) || type.equals(Integer.class)) {
- return Integer.class;
- } else if (type.equals(Long.TYPE) || type.equals(Long.class)) {
- return Long.class;
- } else if (type.equals(Double.TYPE) || type.equals(Double.class)) {
- return Double.class;
- } else if (type.equals(Float.TYPE) || type.equals(Float.class)) {
- return Float.class;
- } else if (type.equals(Short.TYPE) || type.equals(Short.class)) {
- return Short.class;
- } else if (type.equals(Character.TYPE) || type.equals(Character.class)) {
- return Character.class;
- } else if (type.equals(Byte.TYPE) || type.equals(Byte.class)) {
- return Byte.class;
- } else if (type.equals(Boolean.TYPE) || type.equals(Boolean.class)) {
- return Boolean.class;
+ for (BasicClassDetector detector : basicClassDetector) {
+ Class> detectedClass = detector.detectBasicClass(type);
+ if (detectedClass != null) {
+ return detectedClass;
+ }
}
return type;
}
@@ -146,5 +142,4 @@ public Class clazz() {
}
}
-
}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/Source.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/Source.java
new file mode 100644
index 000000000..146827220
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/Source.java
@@ -0,0 +1,68 @@
+package us.codecraft.webmagic.model.sources;
+
+import java.util.List;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.model.FieldExtractor;
+
+public interface Source {
+ public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor);
+ public List getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor);
+
+ public class RawHtml implements Source {
+ public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
+ return page.getHtml().selectDocument(fieldExtractor.getSelector());
+ }
+
+ public List getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
+ return page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
+ }
+ }
+
+ public class SelectedHtml implements Source {
+ public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
+ if (isRaw)
+ return page.getHtml().selectDocument(fieldExtractor.getSelector());
+ else
+ return fieldExtractor.getSelector().select(html);
+ }
+
+ public List getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
+ if (isRaw)
+ return page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
+ else
+ return fieldExtractor.getSelector().selectList(html);
+ }
+ }
+
+ public class Url implements Source {
+ public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
+ return fieldExtractor.getSelector().select(page.getUrl().toString());
+ }
+
+ public List getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
+ return fieldExtractor.getSelector().selectList(page.getUrl().toString());
+ }
+ }
+
+ public class RawText implements Source {
+ public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
+ return fieldExtractor.getSelector().select(page.getRawText());
+ }
+
+ public List getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
+ return fieldExtractor.getSelector().selectList(page.getRawText());
+ }
+ }
+
+ public class DefaultSource implements Source {
+ public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
+ return fieldExtractor.getSelector().select(html);
+ }
+
+ public List getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
+ return fieldExtractor.getSelector().selectList(html);
+ }
+ }
+}
+
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/SourceTextExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/SourceTextExtractor.java
new file mode 100644
index 000000000..1e572695f
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/sources/SourceTextExtractor.java
@@ -0,0 +1,29 @@
+package us.codecraft.webmagic.model.sources;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.model.FieldExtractor;
+import us.codecraft.webmagic.model.fields.MultipleField;
+import us.codecraft.webmagic.model.fields.PageField;
+import us.codecraft.webmagic.model.fields.SingleField;
+
+/**
+ * Static helper that reads a field's raw text from a page and wraps it in a {@link PageField}.
+ */
+public class SourceTextExtractor {
+    private SourceTextExtractor() {
+        // static utility; not meant to be instantiated
+    }
+
+    /**
+     * Reads the text selected by {@code fieldExtractor} from its configured source.
+     *
+     * @return a {@link MultipleField} when the extractor is multi-valued, otherwise a {@link SingleField}
+     */
+    public static PageField getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
+        Source source = fieldExtractor.getSource();
+        if (fieldExtractor.isMulti())
+            return new MultipleField(source.getTextList(page, html, isRaw, fieldExtractor));
+        else
+            return new SingleField(source.getText(page, html, isRaw, fieldExtractor));
+    }
+}
\ No newline at end of file
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java
index cfb4a8200..50dbcaf1a 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java
@@ -1,21 +1,25 @@
package us.codecraft.webmagic.monitor;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import us.codecraft.webmagic.Request;
-import us.codecraft.webmagic.Spider;
-import us.codecraft.webmagic.SpiderListener;
-import us.codecraft.webmagic.utils.Experimental;
-import us.codecraft.webmagic.utils.UrlUtils;
-
-import javax.management.*;
import java.lang.management.ManagementFactory;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
-import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
+import javax.management.InstanceAlreadyExistsException;
+import javax.management.JMException;
+import javax.management.MBeanRegistrationException;
+import javax.management.MBeanServer;
+import javax.management.MalformedObjectNameException;
+import javax.management.NotCompliantMBeanException;
+import javax.management.ObjectName;
+
+import us.codecraft.webmagic.Request;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.SpiderListener;
+import us.codecraft.webmagic.utils.Experimental;
+import us.codecraft.webmagic.utils.UrlUtils;
+
/**
* @author code4crafer@gmail.com
* @since 0.5.0
@@ -23,17 +27,13 @@
@Experimental
public class SpiderMonitor {
- private static SpiderMonitor INSTANCE = new SpiderMonitor();
-
- private AtomicBoolean started = new AtomicBoolean(false);
-
- private Logger logger = LoggerFactory.getLogger(getClass());
+ private static final SpiderMonitor INSTANCE = new SpiderMonitor();
private MBeanServer mbeanServer;
private String jmxServerName;
- private List spiderStatuses = new ArrayList();
+ private List spiderStatuses = new ArrayList<>();
protected SpiderMonitor() {
jmxServerName = "WebMagic";
@@ -51,7 +51,7 @@ public synchronized SpiderMonitor register(Spider... spiders) throws JMException
for (Spider spider : spiders) {
MonitorSpiderListener monitorSpiderListener = new MonitorSpiderListener();
if (spider.getSpiderListeners() == null) {
- List spiderListeners = new ArrayList();
+ List spiderListeners = new ArrayList<>();
spiderListeners.add(monitorSpiderListener);
spider.setSpiderListeners(spiderListeners);
} else {
@@ -68,6 +68,10 @@ protected SpiderStatusMXBean getSpiderStatusMBean(Spider spider, MonitorSpiderLi
return new SpiderStatus(spider, monitorSpiderListener);
}
+ protected List getSpiderStatuses() {
+ return this.spiderStatuses;
+ }
+
public static SpiderMonitor instance() {
return INSTANCE;
}
@@ -86,7 +90,7 @@ public void onSuccess(Request request) {
}
@Override
- public void onError(Request request) {
+ public void onError(Request request, Exception e) {
errorUrls.add(request.getUrl());
errorCount.incrementAndGet();
}
@@ -105,7 +109,6 @@ public List getErrorUrls() {
}
protected void registerMBean(SpiderStatusMXBean spiderStatus) throws MalformedObjectNameException, InstanceAlreadyExistsException, MBeanRegistrationException, NotCompliantMBeanException {
-// ObjectName objName = new ObjectName(jmxServerName + ":name=" + spiderStatus.getName());
ObjectName objName = new ObjectName(jmxServerName + ":name=" + UrlUtils.removePort(spiderStatus.getName()));
mbeanServer.registerMBean(spiderStatus, objName);
}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java
index a87c040bd..69afe042a 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatus.java
@@ -84,8 +84,13 @@ public Date getStartTime() {
@Override
public int getPagePerSecond() {
- int runSeconds = (int) (System.currentTimeMillis() - getStartTime().getTime()) / 1000;
- return getSuccessPageCount() / runSeconds;
+ if (getStartTime() != null) {
+ int runSeconds = (int) ((System.currentTimeMillis() - getStartTime().getTime()) / 1000); // divide as long, then narrow: casting the millisecond difference first overflows int
+ if (runSeconds != 0) {
+ return getSuccessPageCount() / runSeconds;
+ }
+ }
+ return -1;
}
}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java
index fec3c1db9..0dabdd954 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java
@@ -1,29 +1,13 @@
package us.codecraft.webmagic.scheduler;
-import java.io.BufferedReader;
-import java.io.Closeable;
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.FileReader;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.io.PrintWriter;
-import java.util.LinkedHashSet;
-import java.util.Set;
-import java.util.concurrent.BlockingQueue;
-import java.util.concurrent.Executors;
-import java.util.concurrent.LinkedBlockingQueue;
-import java.util.concurrent.ScheduledExecutorService;
-import java.util.concurrent.TimeUnit;
-import java.util.concurrent.atomic.AtomicBoolean;
-import java.util.concurrent.atomic.AtomicInteger;
-
-import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.math.NumberUtils;
-
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
-import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
+
+import java.io.*;
+import java.util.concurrent.*;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
/**
@@ -32,7 +16,7 @@
* @author code4crafter@gmail.com
* @since 0.2.0
*/
-public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler,Closeable {
+public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler, Closeable {
private String filePath = System.getProperty("java.io.tmpdir");
@@ -52,8 +36,6 @@ public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implement
private BlockingQueue queue;
- private Set urls;
-
private ScheduledExecutorService flushThreadPool;
public FileCacheQueueScheduler(String filePath) {
@@ -83,36 +65,13 @@ private void init(Task task) {
}
private void initDuplicateRemover() {
- setDuplicateRemover(
- new DuplicateRemover() {
- @Override
- public boolean isDuplicate(Request request, Task task) {
- if (!inited.get()) {
- init(task);
- }
- return !urls.add(request.getUrl());
- }
-
- @Override
- public void resetDuplicateCheck(Task task) {
- urls.clear();
- }
-
- @Override
- public int getTotalRequestsCount(Task task) {
- return urls.size();
- }
- });
+ BloomFilterDuplicateRemover bloomFilterDuplicateRemover = new BloomFilterDuplicateRemover(this.filePath.hashCode());
+ setDuplicateRemover(bloomFilterDuplicateRemover);
}
private void initFlushThread() {
- flushThreadPool = Executors.newScheduledThreadPool(1);
- flushThreadPool.scheduleAtFixedRate(new Runnable() {
- @Override
- public void run() {
- flush();
- }
- }, 10, 10, TimeUnit.SECONDS);
+ flushThreadPool = Executors.newScheduledThreadPool(1);
+ flushThreadPool.scheduleAtFixedRate(this::flush, 10, 10, TimeUnit.SECONDS);
}
private void initWriter() {
@@ -127,7 +86,6 @@ private void initWriter() {
private void readFile() {
try {
queue = new LinkedBlockingQueue();
- urls = new LinkedHashSet();
readCursorFile();
readUrlFile();
// initDuplicateRemover();
@@ -140,46 +98,43 @@ private void readFile() {
}
private void readUrlFile() throws IOException {
- String line;
- BufferedReader fileUrlReader = null;
- try {
- fileUrlReader = new BufferedReader(new FileReader(getFileName(fileUrlAllName)));
+ try (BufferedReader fileUrlReader = new BufferedReader(new FileReader(getFileName(fileUrlAllName)))) {
+ String line;
int lineReaded = 0;
while ((line = fileUrlReader.readLine()) != null) {
- urls.add(line.trim());
+ Request request = deserializeRequest(line);
+ this.getDuplicateRemover().isDuplicate(request, null);
lineReaded++;
if (lineReaded > cursor.get()) {
- queue.add(deserializeRequest(line));
+ queue.add(request);
}
}
- } finally {
- if (fileUrlReader != null) {
- IOUtils.closeQuietly(fileUrlReader);
- }
}
}
private void readCursorFile() throws IOException {
- BufferedReader fileCursorReader = null;
- try {
- fileCursorReader = new BufferedReader(new FileReader(getFileName(fileCursor)));
+ String fileName = getFileName(fileCursor);
+ try (BufferedReader fileCursorReader = new BufferedReader(new FileReader(fileName))) {
String line;
+ String lastLine = null;
//read the last number
while ((line = fileCursorReader.readLine()) != null) {
- cursor = new AtomicInteger(NumberUtils.toInt(line));
+ line = line.trim();
+ if (!line.isEmpty()) {
+ lastLine = line;
+ }
}
- } finally {
- if (fileCursorReader != null) {
- IOUtils.closeQuietly(fileCursorReader);
+ if (lastLine != null) {
+ cursor.set(NumberUtils.toInt(lastLine)); // parse the remembered line: 'line' is always null once the loop has ended
}
}
}
-
+
public void close() throws IOException {
- flushThreadPool.shutdown();
- fileUrlWriter.close();
- fileCursorWriter.close();
- }
+ flushThreadPool.shutdown();
+ fileUrlWriter.close();
+ fileCursorWriter.close();
+ }
private String getFileName(String filename) {
return filePath + task.getUUID() + filename;
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java
index 540574ad2..7abe5bfad 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisPriorityScheduler.java
@@ -1,22 +1,23 @@
package us.codecraft.webmagic.scheduler;
-import com.alibaba.fastjson.JSON;
+import java.util.Set;
+
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;
+
+import com.alibaba.fastjson.JSON;
+
import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
-import java.util.Set;
-
/**
* the redis scheduler with priority
* @author sai
* Created by sai on 16-5-27.
*/
-public class RedisPriorityScheduler extends RedisScheduler
-{
+public class RedisPriorityScheduler extends RedisScheduler {
private static final String ZSET_PREFIX = "zset_";
@@ -37,62 +38,44 @@ public RedisPriorityScheduler(JedisPool pool) {
}
@Override
- protected void pushWhenNoDuplicate(Request request, Task task)
- {
- Jedis jedis = pool.getResource();
- try
- {
- if(request.getPriority() > 0)
+ protected void pushWhenNoDuplicate(Request request, Task task) {
+ try (Jedis jedis = pool.getResource()) {
+ if (request.getPriority() > 0) {
jedis.zadd(getZsetPlusPriorityKey(task), request.getPriority(), request.getUrl());
- else if(request.getPriority() < 0)
+ } else if (request.getPriority() < 0) {
jedis.zadd(getZsetMinusPriorityKey(task), request.getPriority(), request.getUrl());
- else
+ } else {
jedis.lpush(getQueueNoPriorityKey(task), request.getUrl());
+ }
setExtrasInItem(jedis, request, task);
}
- finally
- {
- pool.returnResource(jedis);
- }
}
@Override
- public synchronized Request poll(Task task)
- {
- Jedis jedis = pool.getResource();
- try
- {
+ public synchronized Request poll(Task task) {
+ try (Jedis jedis = pool.getResource()) {
String url = getRequest(jedis, task);
- if(StringUtils.isBlank(url))
+ if (StringUtils.isBlank(url)) {
return null;
+ }
return getExtrasInItem(jedis, url, task);
}
- finally
- {
- pool.returnResource(jedis);
- }
}
- private String getRequest(Jedis jedis, Task task)
- {
+ private String getRequest(Jedis jedis, Task task) {
String url;
Set urls = jedis.zrevrange(getZsetPlusPriorityKey(task), 0, 0);
- if(urls.isEmpty())
- {
+ if (urls.isEmpty()) {
url = jedis.lpop(getQueueNoPriorityKey(task));
- if(StringUtils.isBlank(url))
- {
+ if (StringUtils.isBlank(url)) {
urls = jedis.zrevrange(getZsetMinusPriorityKey(task), 0, 0);
- if(!urls.isEmpty())
- {
+ if (!urls.isEmpty()) {
url = urls.toArray(new String[0])[0];
jedis.zrem(getZsetMinusPriorityKey(task), url);
}
}
- }
- else
- {
+ } else {
url = urls.toArray(new String[0])[0];
jedis.zrem(getZsetPlusPriorityKey(task), url);
}
@@ -100,51 +83,39 @@ private String getRequest(Jedis jedis, Task task)
}
@Override
- public void resetDuplicateCheck(Task task)
- {
- Jedis jedis = pool.getResource();
- try
- {
+ public void resetDuplicateCheck(Task task) {
+ try (Jedis jedis = pool.getResource()) {
jedis.del(getSetKey(task));
}
- finally
- {
- pool.returnResource(jedis);
- }
}
- private String getZsetPlusPriorityKey(Task task)
- {
+ private String getZsetPlusPriorityKey(Task task) {
return ZSET_PREFIX + task.getUUID() + PLUS_PRIORITY_SUFFIX;
}
- private String getQueueNoPriorityKey(Task task)
- {
+ private String getQueueNoPriorityKey(Task task) {
return QUEUE_PREFIX + task.getUUID() + NO_PRIORITY_SUFFIX;
}
- private String getZsetMinusPriorityKey(Task task)
- {
+ private String getZsetMinusPriorityKey(Task task) {
return ZSET_PREFIX + task.getUUID() + MINUS_PRIORITY_SUFFIX;
}
- private void setExtrasInItem(Jedis jedis,Request request, Task task)
- {
- if(request.getExtras() != null)
- {
- String field = DigestUtils.shaHex(request.getUrl());
+ private void setExtrasInItem(Jedis jedis,Request request, Task task) {
+ if (!request.getExtras().isEmpty()) {
+ String field = DigestUtils.sha1Hex(request.getUrl());
String value = JSON.toJSONString(request);
jedis.hset(getItemKey(task), field, value);
}
}
- private Request getExtrasInItem(Jedis jedis, String url, Task task)
- {
+ private Request getExtrasInItem(Jedis jedis, String url, Task task) {
String key = getItemKey(task);
- String field = DigestUtils.shaHex(url);
+ String field = DigestUtils.sha1Hex(url);
byte[] bytes = jedis.hget(key.getBytes(), field.getBytes());
- if(bytes != null)
+ if (bytes != null) {
return JSON.parseObject(new String(bytes), Request.class);
+ }
return new Request(url);
}
}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java
index c70d88507..8d61bea3b 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java
@@ -1,8 +1,10 @@
package us.codecraft.webmagic.scheduler;
-import com.alibaba.fastjson.JSON;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;
+
+import com.alibaba.fastjson.JSON;
+
import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import redis.clients.jedis.JedisPoolConfig;
@@ -37,21 +39,15 @@ public RedisScheduler(JedisPool pool) {
@Override
public void resetDuplicateCheck(Task task) {
- Jedis jedis = pool.getResource();
- try {
+ try (Jedis jedis = pool.getResource()) {
jedis.del(getSetKey(task));
- } finally {
- pool.returnResource(jedis);
}
}
@Override
public boolean isDuplicate(Request request, Task task) {
- Jedis jedis = pool.getResource();
- try {
+ try (Jedis jedis = pool.getResource()) {
return jedis.sadd(getSetKey(task), request.getUrl()) == 0;
- } finally {
- pool.returnResource(jedis);
}
}
@@ -62,7 +58,7 @@ protected void pushWhenNoDuplicate(Request request, Task task) {
try {
jedis.rpush(getQueueKey(task), request.getUrl());
if (checkForAdditionalInfo(request)) {
- String field = DigestUtils.shaHex(request.getUrl());
+ String field = DigestUtils.sha1Hex(request.getUrl());
String value = JSON.toJSONString(request);
jedis.hset((ITEM_PREFIX + task.getUUID()), field, value);
}
@@ -88,7 +84,7 @@ private boolean checkForAdditionalInfo(Request request) {
return true;
}
- if (request.getExtras() != null && !request.getExtras().isEmpty()) {
+ if (!request.getExtras().isEmpty()) {
return true;
}
if (request.getPriority() != 0L) {
@@ -100,14 +96,13 @@ private boolean checkForAdditionalInfo(Request request) {
@Override
public synchronized Request poll(Task task) {
- Jedis jedis = pool.getResource();
- try {
+ try (Jedis jedis = pool.getResource()) {
String url = jedis.lpop(getQueueKey(task));
if (url == null) {
return null;
}
String key = ITEM_PREFIX + task.getUUID();
- String field = DigestUtils.shaHex(url);
+ String field = DigestUtils.sha1Hex(url);
byte[] bytes = jedis.hget(key.getBytes(), field.getBytes());
if (bytes != null) {
Request o = JSON.parseObject(new String(bytes), Request.class);
@@ -115,8 +110,6 @@ public synchronized Request poll(Task task) {
}
Request request = new Request(url);
return request;
- } finally {
- pool.returnResource(jedis);
}
}
@@ -134,23 +127,17 @@ protected String getItemKey(Task task) {
@Override
public int getLeftRequestsCount(Task task) {
- Jedis jedis = pool.getResource();
- try {
+ try (Jedis jedis = pool.getResource()) {
Long size = jedis.llen(getQueueKey(task));
return size.intValue();
- } finally {
- pool.returnResource(jedis);
}
}
@Override
public int getTotalRequestsCount(Task task) {
- Jedis jedis = pool.getResource();
- try {
+ try (Jedis jedis = pool.getResource()) {
Long size = jedis.scard(getSetKey(task));
return size.intValue();
- } finally {
- pool.returnResource(jedis);
}
}
}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java
old mode 100755
new mode 100644
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java
old mode 100755
new mode 100644
diff --git a/webmagic-extension/src/main/resources/log4j.xml b/webmagic-extension/src/main/resources/log4j.xml
deleted file mode 100644
index c2b5a2f53..000000000
--- a/webmagic-extension/src/main/resources/log4j.xml
+++ /dev/null
@@ -1,21 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessorTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessorTest.java
index 63c40d295..c2081dbf3 100644
--- a/webmagic-extension/src/test/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessorTest.java
+++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/configurable/ConfigurablePageProcessorTest.java
@@ -13,7 +13,6 @@
/**
* @author code4crafter@gmail.com
- * @date 14-4-5
*/
public class ConfigurablePageProcessorTest {
diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java
index 91e3698cf..bb18aa2c5 100644
--- a/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java
+++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java
@@ -938,6 +938,7 @@ public Page download(Request request, Task task) {
Page page = new Page();
page.setRawText(html);
page.setStatusCode(200);
+ page.setDownloadSuccess(true);
page.setRequest(new Request("https://github.com/code4craft/webmagic"));
page.setUrl(new PlainText("https://github.com/code4craft/webmagic"));
return page;
diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/ModelPageProcessorTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/ModelPageProcessorTest.java
index 627fa6e84..1014a45f5 100644
--- a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/ModelPageProcessorTest.java
+++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/ModelPageProcessorTest.java
@@ -12,7 +12,6 @@
/**
* @author code4crafter@gmail.com
- * @date 14-4-4
*/
public class ModelPageProcessorTest {
diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMocker.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMocker.java
index 4b0c133cb..0451edcfe 100644
--- a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMocker.java
+++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageMocker.java
@@ -1,11 +1,13 @@
package us.codecraft.webmagic.model;
+
+import java.io.IOException;
+import java.nio.charset.Charset;
import org.apache.commons.io.IOUtils;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.selector.PlainText;
-import java.io.IOException;
/**
* @author code4crafter@gmail.com
@@ -16,7 +18,7 @@ public class PageMocker {
public Page getMockJsonPage() throws IOException {
Page page = new Page();
- page.setRawText(IOUtils.toString(PageMocker.class.getClassLoader().getResourceAsStream("json/mock-githubrepo.json")));
+ page.setRawText(IOUtils.toString(PageMocker.class.getClassLoader().getResourceAsStream("json/mock-githubrepo.json"), Charset.defaultCharset()));
page.setRequest(new Request("https://api.github.com/repos/code4craft/webmagic"));
page.setUrl(new PlainText("https://api.github.com/repos/code4craft/webmagic"));
return page;
@@ -24,7 +26,7 @@ public Page getMockJsonPage() throws IOException {
public Page getMockPage() throws IOException {
Page page = new Page();
- page.setRawText(IOUtils.toString(PageMocker.class.getClassLoader().getResourceAsStream("html/mock-webmagic.html")));
+ page.setRawText(IOUtils.toString(PageMocker.class.getClassLoader().getResourceAsStream("html/mock-webmagic.html"), Charset.defaultCharset()));
page.setRequest(new Request("http://webmagic.io/list/0"));
page.setUrl(new PlainText("http://webmagic.io/list/0"));
return page;
diff --git a/webmagic-extension/src/test/resources/log4j.xml b/webmagic-extension/src/test/resources/log4j.xml
deleted file mode 100644
index c2b5a2f53..000000000
--- a/webmagic-extension/src/test/resources/log4j.xml
+++ /dev/null
@@ -1,21 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/webmagic-extension/src/test/resources/log4j2-test.xml b/webmagic-extension/src/test/resources/log4j2-test.xml
new file mode 100644
index 000000000..86aee5f59
--- /dev/null
+++ b/webmagic-extension/src/test/resources/log4j2-test.xml
@@ -0,0 +1,16 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml
index 44fee7c0d..50e79c73e 100644
--- a/webmagic-samples/pom.xml
+++ b/webmagic-samples/pom.xml
@@ -1,9 +1,14 @@
-
+
- webmagic-parent
us.codecraft
- 0.7.3
+ webmagic
+ 1.0.3
4.0.0
@@ -21,8 +26,24 @@
${project.version}
- junit
- junit
+ org.mapdb
+ mapdb
+ 3.1.0
+
+
+ com.fasterxml.jackson.core
+ jackson-core
+ 2.15.2
+
+
+ com.fasterxml.jackson.core
+ jackson-annotations
+ 2.15.2
+
+
+ com.fasterxml.jackson.core
+ jackson-databind
+ 2.16.0
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java
index 941bdbde8..136e88d9e 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java
@@ -49,7 +49,7 @@ public static void main(String[] args) {
@Override
public String key() {
- return author+":"+name;
+ return author+"_"+name;
}
public String getName() {
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/DuplicateStorageRemover.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/DuplicateStorageRemover.java
new file mode 100644
index 000000000..bee80e775
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/DuplicateStorageRemover.java
@@ -0,0 +1,97 @@
+package us.codecraft.webmagic.recover;
+
+import com.google.common.base.Charsets;
+import com.google.common.hash.BloomFilter;
+import com.google.common.hash.Funnels;
+import org.mapdb.DB;
+import org.mapdb.DBMaker;
+import org.mapdb.IndexTreeList;
+import org.mapdb.Serializer;
+import us.codecraft.webmagic.Request;
+import us.codecraft.webmagic.Task;
+import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
+
+import java.util.concurrent.atomic.AtomicInteger;
+
+/**
+ * {@link DuplicateRemover} that stores every accepted URL in a MapDB file and answers
+ * lookups from a Bloom filter that is rebuilt from that file when the remover is created.
+ *
+ * @author :linweisen
+ */
+public class DuplicateStorageRemover implements DuplicateRemover {
+
+    private static final String DATABASE_NAME = "duplicate";
+
+    private final DB db;
+
+    private final IndexTreeList<String> urlDuplicateQueue;
+
+    private final AtomicInteger counter;
+
+    private BloomFilter<CharSequence> bloomFilter;
+
+    /**
+     * Opens (or creates) the MapDB file at {@code path} and loads the URLs already stored
+     * there into a fresh Bloom filter.
+     *
+     * @param path location of the MapDB file
+     */
+    public DuplicateStorageRemover(String path) {
+        this.db = DBMaker.fileDB(path)
+                .fileMmapEnableIfSupported()
+                .fileMmapPreclearDisable()
+                .cleanerHackEnable()
+                .closeOnJvmShutdown()
+                .transactionEnable()
+                .concurrencyScale(128)
+                .make();
+        this.urlDuplicateQueue = db.indexTreeList(DATABASE_NAME, Serializer.STRING).createOrOpen();
+        this.counter = new AtomicInteger(this.urlDuplicateQueue.size());
+        this.bloomFilter = newBloomFilter();
+        for (String url : this.urlDuplicateQueue) {
+            bloomFilter.put(url);
+        }
+    }
+
+    /**
+     * Reports whether the request URL is already in the Bloom filter; a URL that is not
+     * is added to the filter and to the stored list, and the change is committed at once.
+     */
+    @Override
+    public boolean isDuplicate(Request request, Task task) {
+        String url = request.getUrl();
+        boolean isDuplicate = bloomFilter.mightContain(url);
+        if (!isDuplicate) {
+            bloomFilter.put(url);
+            urlDuplicateQueue.add(url);
+            this.db.commit();
+            counter.incrementAndGet();
+        }
+        return isDuplicate;
+    }
+
+    /**
+     * Forgets every recorded URL: replaces the Bloom filter, empties the stored list and
+     * commits that, and resets the counter so {@link #getTotalRequestsCount(Task)}
+     * agrees with the stored state.
+     */
+    @Override
+    public void resetDuplicateCheck(Task task) {
+        this.bloomFilter = newBloomFilter();
+        this.urlDuplicateQueue.clear();
+        this.db.commit();
+        this.counter.set(0);
+    }
+
+    /** Returns the number of URLs recorded so far. */
+    @Override
+    public int getTotalRequestsCount(Task task) {
+        return counter.get();
+    }
+
+    /** Creates an empty Bloom filter for 200000 expected insertions at a 1E-7 false-positive rate. */
+    private static BloomFilter<CharSequence> newBloomFilter() {
+        return BloomFilter.create(Funnels.stringFunnel(Charsets.UTF_8), 200000, 1E-7);
+    }
+}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/MmapQueueScheduler.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/MmapQueueScheduler.java
new file mode 100644
index 000000000..4cee18afd
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/MmapQueueScheduler.java
@@ -0,0 +1,112 @@
+package us.codecraft.webmagic.recover;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import org.apache.commons.lang3.StringUtils;
+import org.mapdb.DB;
+import org.mapdb.DBMaker;
+import org.mapdb.IndexTreeList;
+import org.mapdb.Serializer;
+import us.codecraft.webmagic.Request;
+import us.codecraft.webmagic.Task;
+import us.codecraft.webmagic.scheduler.DuplicateRemovedScheduler;
+import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
+
+import java.io.IOException;
+
+/**
+ * Scheduler that keeps pending requests as JSON strings in a MapDB file.
+ *
+ * @author :linweisen
+ */
+public class MmapQueueScheduler extends DuplicateRemovedScheduler {
+
+    private static final String DATABASE_NAME = "queue";
+
+    // One shared mapper instead of re-assigning a static field from every constructor call.
+    private static final ObjectMapper MAPPER = new ObjectMapper();
+
+    private final DB db;
+
+    private final IndexTreeList<String> queue;
+
+    /**
+     * Opens (or creates) the MapDB file at {@code path} and installs the given remover.
+     *
+     * @param duplicateRemover duplicate remover to install on this scheduler
+     * @param path             location of the MapDB file holding the queue
+     */
+    public MmapQueueScheduler(DuplicateRemover duplicateRemover, String path) {
+        super.setDuplicateRemover(duplicateRemover);
+        this.db = DBMaker.fileDB(path)
+                .fileMmapEnableIfSupported()
+                .fileMmapPreclearDisable()
+                .cleanerHackEnable()
+                .closeOnJvmShutdown()
+                .transactionEnable()
+                .concurrencyScale(128)
+                .make();
+        this.queue = db.indexTreeList(DATABASE_NAME, Serializer.STRING).createOrOpen();
+    }
+
+    /**
+     * Removes and returns the request at the head of the queue, or {@code null} when the
+     * queue is empty. Synchronized so the emptiness check and the removal cannot
+     * interleave between polling threads.
+     */
+    @Override
+    public synchronized Request poll(Task task) {
+        if (queue.isEmpty()) {
+            return null;
+        }
+        // NOTE(review): the removal is not committed here; it is only persisted by the next
+        // commit in pushWhenNoDuplicate. Confirm this is the intended recovery behaviour.
+        return fromJson(queue.remove(0), Request.class);
+    }
+
+    /**
+     * Appends the request to the queue as JSON and commits the change. A request that
+     * cannot be serialized is skipped rather than stored as a null entry.
+     */
+    @Override
+    public void pushWhenNoDuplicate(Request request, Task task) {
+        String json = toJson(request);
+        if (json == null) {
+            // toJson has already logged the failure.
+            return;
+        }
+        queue.add(json);
+        this.db.commit();
+    }
+
+    /**
+     * Serializes {@code object} to JSON.
+     *
+     * @return the JSON string, or {@code null} if serialization fails (the failure is logged)
+     */
+    public String toJson(Object object) {
+        try {
+            return MAPPER.writeValueAsString(object);
+        } catch (IOException e) {
+            logger.warn("write to json string error:" + object, e);
+            return null;
+        }
+    }
+
+    /**
+     * Parses {@code jsonString} into an instance of {@code clazz}.
+     *
+     * @return the parsed object, or {@code null} if the input is empty or cannot be parsed
+     */
+    public <T> T fromJson(String jsonString, Class<T> clazz) {
+        if (StringUtils.isEmpty(jsonString)) {
+            return null;
+        }
+        try {
+            return MAPPER.readValue(jsonString, clazz);
+        } catch (IOException e) {
+            logger.warn("parse json string error:" + jsonString, e);
+            return null;
+        }
+    }
+
+}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/RecoverSample.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/RecoverSample.java
new file mode 100644
index 000000000..4fb91a0d2
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/recover/RecoverSample.java
@@ -0,0 +1,21 @@
+package us.codecraft.webmagic.recover;
+
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.samples.SinaBlogProcessor;
+import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
+
+/**
+ * Sample entry point that runs a spider with the MapDB-backed queue and duplicate remover.
+ *
+ * @author code4crafter@gmail.com
+ */
+public class RecoverSample {
+
+    /** Builds the spider, installs the file-backed scheduler, then starts crawling from one seed URL. */
+    public static void main(String[] args) {
+        Spider crawler = new Spider(new SinaBlogProcessor());
+        DuplicateRemover seenUrls = new DuplicateStorageRemover("duplicate");
+        crawler.setScheduler(new MmapQueueScheduler(seenUrls, "queue"));
+        crawler.addUrl("http://blog.sina.com.cn/s/articlelist_1487828712_0_1.html").run();
+    }
+}
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java
index ab560e451..46476bbc8 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java
@@ -1,14 +1,14 @@
package us.codecraft.webmagic.samples;
-import org.apache.commons.collections.CollectionUtils;
+
+import java.util.List;
+import org.apache.commons.collections4.CollectionUtils;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.JsonPathSelector;
-import java.util.List;
-
/**
* @author code4crafter@gmail.com
* @since 0.5.0
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java
index 280f8f186..33dd6aa35 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java
@@ -1,6 +1,6 @@
package us.codecraft.webmagic.samples;
-import org.apache.commons.collections.CollectionUtils;
+import org.apache.commons.collections4.CollectionUtils;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/PhantomJSPageProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/PhantomJSPageProcessor.java
index 99d5fa84e..ab5314073 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/PhantomJSPageProcessor.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/PhantomJSPageProcessor.java
@@ -36,7 +36,7 @@ public Site getSite() {
}
public static void main(String[] args) throws Exception {
- PhantomJSDownloader phantomDownloader = new PhantomJSDownloader().setRetryNum(3);
+ PhantomJSDownloader phantomDownloader = new PhantomJSDownloader();
CollectorPipeline collectorPipeline = new ResultItemsCollectorPipeline();
diff --git a/webmagic-samples/src/main/resources/log4j.xml b/webmagic-samples/src/main/resources/log4j.xml
deleted file mode 100644
index a6630f813..000000000
--- a/webmagic-samples/src/main/resources/log4j.xml
+++ /dev/null
@@ -1,26 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/webmagic-samples/src/main/resources/log4j2.xml b/webmagic-samples/src/main/resources/log4j2.xml
new file mode 100644
index 000000000..f3bad53d8
--- /dev/null
+++ b/webmagic-samples/src/main/resources/log4j2.xml
@@ -0,0 +1,19 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml
index da0c5f202..26d1989d6 100644
--- a/webmagic-saxon/pom.xml
+++ b/webmagic-saxon/pom.xml
@@ -1,14 +1,23 @@
-
+
- webmagic-parent
us.codecraft
- 0.7.3
+ webmagic
+ 1.0.3
4.0.0
webmagic-saxon
+
+ true
+
+
${project.groupId}
@@ -23,23 +32,6 @@
net.sf.saxon
Saxon-HE
-
- junit
- junit
-
-
-
-
- org.apache.maven.plugins
- maven-deploy-plugin
- 3.0.0-M1
-
- true
-
-
-
-
-
diff --git a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/JaxpSelectorUtils.java b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/JaxpSelectorUtils.java
new file mode 100644
index 000000000..b03f3a2ab
--- /dev/null
+++ b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/JaxpSelectorUtils.java
@@ -0,0 +1,80 @@
+package us.codecraft.webmagic.selector;
+
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerException;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.stream.StreamResult;
+import java.io.StringWriter;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * Static helpers for turning JAXP DOM nodes into lists and strings.
+ *
+ * @author hooy
+ */
+public final class JaxpSelectorUtils {
+
+    private JaxpSelectorUtils() {
+        throw new RuntimeException("The util class cannot be instanced");
+    }
+
+    /**
+     * Copies a {@link NodeList} into a new list, preserving order.
+     *
+     * @param nodes the DOM node list to copy
+     * @return a new list holding every node of {@code nodes}
+     */
+    public static List<Node> NodeListToArrayList(NodeList nodes) {
+        List<Node> list = new ArrayList<>(nodes.getLength());
+        for (int i = 0; i < nodes.getLength(); i++) {
+            list.add(nodes.item(i));
+        }
+        return list;
+    }
+
+    /**
+     * Converts a single node to a string using the same rules as {@link #nodesToStrings(List)}.
+     *
+     * @param node the node to convert
+     * @return the string form of {@code node}, or {@code null} if no result was produced
+     * @throws TransformerException if the node cannot be serialized
+     */
+    public static String nodeToString(Node node) throws TransformerException {
+        List<String> converted = nodesToStrings(Collections.singletonList(node));
+        return converted.isEmpty() ? null : converted.get(0);
+    }
+
+    /**
+     * Converts each node to a string: attribute and text nodes yield their text content,
+     * every other node is serialized as markup without an XML declaration.
+     *
+     * @param nodes the nodes to convert
+     * @return one string per node, in the same order
+     * @throws TransformerException if a transformer cannot be created or a node cannot be serialized
+     */
+    public static List<String> nodesToStrings(List<Node> nodes) throws TransformerException {
+        List<String> results = new ArrayList<>(nodes.size());
+        Transformer transformer = TransformerFactory.newInstance().newTransformer();
+        StreamResult xmlOutput = new StreamResult();
+        transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
+        for (Node node : nodes) {
+            if (node.getNodeType() == Node.ATTRIBUTE_NODE || node.getNodeType() == Node.TEXT_NODE) {
+                results.add(node.getTextContent());
+            } else {
+                // Fresh writer per node so each result holds only that node's markup.
+                xmlOutput.setWriter(new StringWriter());
+                transformer.transform(new DOMSource(node), xmlOutput);
+                results.add(xmlOutput.getWriter().toString());
+            }
+        }
+        return results;
+    }
+
+}
diff --git a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/NodeSelector.java b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/NodeSelector.java
new file mode 100644
index 000000000..3e6339dda
--- /dev/null
+++ b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/NodeSelector.java
@@ -0,0 +1,32 @@
+package us.codecraft.webmagic.selector;
+
+import org.w3c.dom.Node;
+
+import java.util.List;
+
+/**
+ * Selector (extractor) that works on an already parsed DOM node.
+ *
+ * @author hooy
+ * @since 0.8.0
+ */
+public interface NodeSelector {
+
+    /**
+     * Extracts a single result as text.
+     * If there is more than one result, only the first is chosen.
+     *
+     * @param node the node to select from
+     * @return the selected text
+     */
+    String select(Node node);
+
+    /**
+     * Extracts all results as text.
+     *
+     * @param node the node to select from
+     * @return the selected texts
+     */
+    List<String> selectList(Node node);
+
+}
diff --git a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java
index d8aab6cce..6c5d7b332 100644
--- a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java
+++ b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java
@@ -1,7 +1,14 @@
package us.codecraft.webmagic.selector;
-import net.sf.saxon.lib.NamespaceConstant;
-import net.sf.saxon.xpath.XPathEvaluator;
+import java.util.*;
+import java.util.concurrent.ConcurrentHashMap;
+
+import javax.xml.namespace.NamespaceContext;
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.xpath.XPathConstants;
+import javax.xml.xpath.XPathExpression;
+import javax.xml.xpath.XPathExpressionException;
+
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.DomSerializer;
import org.htmlcleaner.HtmlCleaner;
@@ -12,36 +19,26 @@
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
-import javax.xml.namespace.NamespaceContext;
-import javax.xml.transform.OutputKeys;
-import javax.xml.transform.Transformer;
-import javax.xml.transform.TransformerFactory;
-import javax.xml.transform.dom.DOMSource;
-import javax.xml.transform.stream.StreamResult;
-import javax.xml.xpath.XPathConstants;
-import javax.xml.xpath.XPathExpression;
-import javax.xml.xpath.XPathExpressionException;
-import java.io.StringWriter;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.concurrent.ConcurrentHashMap;
+import net.sf.saxon.lib.NamespaceConstant;
+import net.sf.saxon.xpath.XPathEvaluator;
+import us.codecraft.webmagic.utils.BaseSelectorUtils;
+
+import static us.codecraft.webmagic.selector.JaxpSelectorUtils.*;
/**
* 支持xpath2.0的选择器。包装了HtmlCleaner和Saxon HE。
*
- * @author code4crafter@gmail.com
- * Date: 13-4-21
- * Time: 上午9:39
+ * @author code4crafter@gmail.com, hooy
+ * Date: 13-4-21
+ * Time: 上午9:39
*/
-public class Xpath2Selector implements Selector {
+public class Xpath2Selector implements Selector, NodeSelector {
- private String xpathStr;
+ private final String xpathStr;
private XPathExpression xPathExpression;
- private Logger logger = LoggerFactory.getLogger(getClass());
+ private final Logger logger = LoggerFactory.getLogger(getClass());
public Xpath2Selector(String xpathStr) {
this.xpathStr = xpathStr;
@@ -52,27 +49,28 @@ public Xpath2Selector(String xpathStr) {
}
}
+ public static Xpath2Selector newInstance(String xpathStr) {
+ return new Xpath2Selector(xpathStr);
+ }
+
enum XPath2NamespaceContext implements NamespaceContext {
INSTANCE;
- private final Map prefix2NamespaceMap = new ConcurrentHashMap();
+ private final Map prefix2NamespaceMap = new ConcurrentHashMap<>();
- private final Map> namespace2PrefixMap = new ConcurrentHashMap>();
+ private final Map> namespace2PrefixMap = new ConcurrentHashMap<>();
private void put(String prefix, String namespaceURI) {
prefix2NamespaceMap.put(prefix, namespaceURI);
- List prefixes = namespace2PrefixMap.get(namespaceURI);
- if (prefixes == null) {
- prefixes = new ArrayList();
- namespace2PrefixMap.put(namespaceURI, prefixes);
- }
+ List prefixes = namespace2PrefixMap.computeIfAbsent(namespaceURI, k -> new ArrayList<>());
prefixes.add(prefix);
}
- private XPath2NamespaceContext() {
+ XPath2NamespaceContext() {
put("fn", NamespaceConstant.FN);
put("xslt", NamespaceConstant.XSLT);
+ put("xhtml", NamespaceConstant.XHTML);
}
@Override
@@ -108,32 +106,18 @@ private void init() throws XPathExpressionException {
@Override
public String select(String text) {
try {
- HtmlCleaner htmlCleaner = new HtmlCleaner();
- TagNode tagNode = htmlCleaner.clean(text);
- Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
- Object result;
- try {
- result = xPathExpression.evaluate(document, XPathConstants.NODESET);
- } catch (XPathExpressionException e) {
- result = xPathExpression.evaluate(document, XPathConstants.STRING);
- }
- if (result instanceof NodeList) {
- NodeList nodeList = (NodeList) result;
- if (nodeList.getLength() == 0) {
- return null;
- }
- Node item = nodeList.item(0);
- if (item.getNodeType() == Node.ATTRIBUTE_NODE || item.getNodeType() == Node.TEXT_NODE) {
- return item.getTextContent();
- } else {
- StreamResult xmlOutput = new StreamResult(new StringWriter());
- Transformer transformer = TransformerFactory.newInstance().newTransformer();
- transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
- transformer.transform(new DOMSource(item), xmlOutput);
- return xmlOutput.getWriter().toString();
- }
- }
- return result.toString();
+ Document doc = parse(text);
+ return select(doc);
+ } catch (Exception e) {
+ logger.error("select text error! " + xpathStr, e);
+ }
+ return null;
+ }
+
+ @Override
+ public String select(Node node) {
+ try {
+ return (String) xPathExpression.evaluate(node, XPathConstants.STRING);
} catch (Exception e) {
logger.error("select text error! " + xpathStr, e);
}
@@ -142,38 +126,72 @@ public String select(String text) {
@Override
public List selectList(String text) {
- List results = new ArrayList();
try {
- HtmlCleaner htmlCleaner = new HtmlCleaner();
- TagNode tagNode = htmlCleaner.clean(text);
- Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
- Object result;
- try {
- result = xPathExpression.evaluate(document, XPathConstants.NODESET);
- } catch (XPathExpressionException e) {
- result = xPathExpression.evaluate(document, XPathConstants.STRING);
- }
- if (result instanceof NodeList) {
- NodeList nodeList = (NodeList) result;
- Transformer transformer = TransformerFactory.newInstance().newTransformer();
- StreamResult xmlOutput = new StreamResult();
- transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
- for (int i = 0; i < nodeList.getLength(); i++) {
- Node item = nodeList.item(i);
- if (item.getNodeType() == Node.ATTRIBUTE_NODE || item.getNodeType() == Node.TEXT_NODE) {
- results.add(item.getTextContent());
- } else {
- xmlOutput.setWriter(new StringWriter());
- transformer.transform(new DOMSource(item), xmlOutput);
- results.add(xmlOutput.getWriter().toString());
- }
- }
- } else {
- results.add(result.toString());
- }
+ Document doc = parse(text);
+ return selectList(doc);
} catch (Exception e) {
logger.error("select text error! " + xpathStr, e);
}
- return results;
+ return null;
}
+
+ @Override
+ public List selectList(Node node) {
+ try {
+ NodeList result = (NodeList) xPathExpression.evaluate(node, XPathConstants.NODESET);
+ List nodes = NodeListToArrayList(result);
+ return nodesToStrings(nodes);
+ } catch (Exception e) {
+ logger.error("select text error! " + xpathStr, e);
+ }
+ return null;
+ }
+
+ public Node selectNode(String text) {
+ try {
+ Document doc = parse(text);
+ return selectNode(doc);
+ } catch (Exception e) {
+ logger.error("select text error! " + xpathStr, e);
+ }
+ return null;
+ }
+
+ public Node selectNode(Node node) {
+ try {
+ return (Node) xPathExpression.evaluate(node, XPathConstants.NODE);
+ } catch (Exception e) {
+ logger.error("select text error! " + xpathStr, e);
+ }
+ return null;
+ }
+
+ public List selectNodes(String text) {
+ try {
+ Document doc = parse(text);
+ return selectNodes(doc);
+ } catch (Exception e) {
+ logger.error("select text error! " + xpathStr, e);
+ }
+ return null;
+ }
+
+ public List selectNodes(Node node) {
+ try {
+ NodeList result = (NodeList) xPathExpression.evaluate(node, XPathConstants.NODESET);
+ return NodeListToArrayList(result);
+ } catch (Exception e) {
+ logger.error("select text error! " + xpathStr, e);
+ }
+ return null;
+ }
+
+ protected static Document parse(String text) throws ParserConfigurationException {
+ // HtmlCleaner could not parse <tr> or <td> tags directly
+ text = BaseSelectorUtils.preParse(text);
+ HtmlCleaner htmlCleaner = new HtmlCleaner();
+ TagNode tagNode = htmlCleaner.clean(text);
+ return new DomSerializer(new CleanerProperties()).createDOM(tagNode);
+ }
+
}
diff --git a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java
index aa3765a0c..4033fcfbd 100644
--- a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java
+++ b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java
@@ -1,5 +1,7 @@
package us.codecraft.webmagic.selector;
+import java.util.List;
+
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;
@@ -8,9 +10,16 @@
import org.junit.Assert;
import org.junit.Ignore;
import org.junit.Test;
+
+import org.w3c.dom.Node;
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.xsoup.XPathEvaluator;
import us.codecraft.xsoup.Xsoup;
+import javax.xml.transform.TransformerException;
+
/**
* @author code4crafter@gmail.com Date: 13-4-21 Time: 上午10:06
*/
@@ -1367,46 +1376,50 @@ public void testOschina() {
public void testXPath2() {
String text = "眉山:扎实推进农业农村工作 促农持续增收 \n" +
"2013-07-31 23:29:45 来源:眉山网 责任编辑:张斯炜 ";
- XpathSelector xpathSelector = new XpathSelector("//h1/text()");
- System.out.println(xpathSelector.select(text));
+ Xpath2Selector xpathSelector = new Xpath2Selector("//h1/text()");
+ Assert.assertEquals("眉山:扎实推进农业农村工作 促农持续增收", xpathSelector.select(text));
}
@Test
public void testXpath2Selector() {
- Xpath2Selector xpath2Selector = new Xpath2Selector("//a/@href");
+ Xpath2Selector xpath2Selector = new Xpath2Selector("//xhtml:a/@href");
String select = xpath2Selector.select(html);
- Assert.assertNotNull(select);
+ Assert.assertEquals("http://www.oschina.net/", select);
+
+ List selectList = xpath2Selector.selectList(html);
+ Assert.assertEquals(113, selectList.size());
+ Assert.assertEquals("http://www.oschina.net/", selectList.get(0));
}
@Ignore("take long time")
@Test
public void performanceTest() {
Xpath2Selector xpath2Selector = new Xpath2Selector("//a");
- long time =System.currentTimeMillis();
+ long time = System.currentTimeMillis();
for (int i = 0; i < 1000; i++) {
xpath2Selector.selectList(html);
}
- System.out.println(System.currentTimeMillis()-time);
+ System.out.println(System.currentTimeMillis() - time);
XpathSelector xpathSelector = new XpathSelector("//a");
- time =System.currentTimeMillis();
+ time = System.currentTimeMillis();
for (int i = 0; i < 1000; i++) {
xpathSelector.selectList(html);
}
- System.out.println(System.currentTimeMillis()-time);
+ System.out.println(System.currentTimeMillis() - time);
- time =System.currentTimeMillis();
+ time = System.currentTimeMillis();
for (int i = 0; i < 1000; i++) {
xpath2Selector.selectList(html);
}
System.out.println(System.currentTimeMillis() - time);
CssSelector cssSelector = new CssSelector("a");
- time =System.currentTimeMillis();
+ time = System.currentTimeMillis();
for (int i = 0; i < 1000; i++) {
cssSelector.selectList(html);
}
- System.out.println("css "+(System.currentTimeMillis()-time));
+ System.out.println("css " + (System.currentTimeMillis() - time));
}
@Ignore("take long time")
@@ -1418,55 +1431,92 @@ public void parserPerformanceTest() throws XPatherException {
TagNode tagNode = htmlCleaner.clean(html);
Document document = Jsoup.parse(html);
- long time =System.currentTimeMillis();
+ long time = System.currentTimeMillis();
for (int i = 0; i < 2000; i++) {
htmlCleaner.clean(html);
}
- System.out.println(System.currentTimeMillis()-time);
+ System.out.println(System.currentTimeMillis() - time);
- time =System.currentTimeMillis();
+ time = System.currentTimeMillis();
for (int i = 0; i < 2000; i++) {
tagNode.evaluateXPath("//a");
}
- System.out.println(System.currentTimeMillis()-time);
+ System.out.println(System.currentTimeMillis() - time);
System.out.println("=============");
- time =System.currentTimeMillis();
+ time = System.currentTimeMillis();
for (int i = 0; i < 2000; i++) {
Jsoup.parse(html);
}
- System.out.println(System.currentTimeMillis()-time);
+ System.out.println(System.currentTimeMillis() - time);
- time =System.currentTimeMillis();
+ time = System.currentTimeMillis();
for (int i = 0; i < 2000; i++) {
document.select("a");
}
- System.out.println(System.currentTimeMillis()-time);
+ System.out.println(System.currentTimeMillis() - time);
System.out.println("=============");
- time =System.currentTimeMillis();
+ time = System.currentTimeMillis();
for (int i = 0; i < 2000; i++) {
htmlCleaner.clean(html);
}
- System.out.println(System.currentTimeMillis()-time);
+ System.out.println(System.currentTimeMillis() - time);
- time =System.currentTimeMillis();
+ time = System.currentTimeMillis();
for (int i = 0; i < 2000; i++) {
tagNode.evaluateXPath("//a");
}
- System.out.println(System.currentTimeMillis()-time);
+ System.out.println(System.currentTimeMillis() - time);
System.out.println("=============");
XPathEvaluator compile = Xsoup.compile("//a");
- time =System.currentTimeMillis();
+ time = System.currentTimeMillis();
for (int i = 0; i < 2000; i++) {
compile.evaluate(document);
}
- System.out.println(System.currentTimeMillis()-time);
+ System.out.println(System.currentTimeMillis() - time);
+
+ }
+ /**
+ * New api test
+ *
+ * @author hooy
+ * @since 8.0
+ */
+ private String rank = " 点击榜 排名 分类 书名/最新章节 作者 推荐 更新时间 1. 现实 0
11-24 22:32 2. 架空 1047
03-04 14:44 3. 现实 0
07-20 09:06 4. 豪门 0
12-03 09:12 5. 现实 0
02-01 21:12 6. 玄奇 3455
02-28 12:31 7. 玄奇 20614
03-31 12:37 8. 复仇 55
06-03 11:43 9. 穿越 0
10-27 18:50 10. 宫斗 320
10-31 13:58 11. 宫斗 6268
07-12 20:23 12. 现实 0
01-18 23:00 13. 婚恋 0
12-14 20:50 14. 修真 0
02-03 23:40 15. 豪门 0
11-06 23:38 16. 穿越 191
12-02 23:37 17. 穿越 412
10-13 22:39 18. 豪门 635
07-01 13:15 19. 架空 144
06-18 09:35 20. 宅斗 1032
08-15 19:03 21. 宫斗 0
09-30 20:32 22. 豪门 0
06-05 11:31 23. 重生 80
11-25 19:56 24. 异世 68
01-12 10:06 25. 豪门 0
05-29 18:46 26. 婚恋 2778
11-04 17:48 27. 玄奇 207
12-06 16:57 28. 穿越 260
01-04 23:26 29. 豪门 0
12-07 21:39 30. 架空 1127
06-06 17:28 31. 穿越 113
09-13 09:06 32. 架空 597
02-14 18:47 33. 玄奇 528
06-04 22:04 34. 穿越 328
06-06 22:09 35. 架空 539
05-24 14:42 36. 架空 0
03-05 23:27 37. 穿越 3215
08-21 16:38 38. 宫斗 905
08-04 20:24 39. 玄奇 1328
07-25 10:58 40. 穿越 203
01-27 20:53 41. 宫斗 407
08-31 09:03 42. 宅斗 16
05-03 17:38 43. 豪门 0
11-10 08:00 44. 婚恋 0
07-12 21:37 45. 架空 0
06-23 21:02 46. 玄奇 1382
05-31 20:36 47. 重生 334
07-16 19:19 48. 婚恋 505
11-01 16:42 49. 婚恋 0
10-19 18:32 50. 豪门 540
09-19 19:18 51. 婚恋 226
03-18 13:09 52. 穿越 1026
03-08 16:28 53. 重生 304
02-19 10:25 54. 玄奇 2617
02-15 20:57 55. 穿越 199
09-04 19:43 56. 同人 768
07-19 20:00 57. 宅斗 0
02-13 18:13 58. 豪门 0
11-12 22:23 59. 架空 0
07-28 23:42 60. 婚恋 0
02-03 23:09 61. 豪门 285
01-07 19:21 62. 重生 654
10-12 18:16 63. 异能 617
06-18 20:23 64. 宫斗 27
06-02 21:05 65. 种田 206
08-31 19:23 66. 宅斗 2444
08-19 15:51 67. 宅斗 818
08-07 23:38 68. 现代 0
12-23 17:02 69. 玄奇 0
07-23 12:00 70. 婚恋 0
11-01 16:43 71. 豪门 0
09-12 00:01 72. 架空 0
04-27 22:42 73. 豪门 0
04-19 13:55 74. 异能 62
07-30 00:00 75. 穿越 1307
07-20 16:41 76. 玄奇 12820
07-15 23:46 77. 架空 828
06-06 17:54 78. 宅斗 985
05-20 23:53 79. 玄奇 4960
04-12 15:58 80. 玄奇 245
03-02 23:11 81. 宅斗 34
12-21 10:11 82. 宅斗 1411
07-21 00:00 83. 现代 0
07-31 10:10 84. 玄奇 0
06-18 13:53 85. 架空 0
12-03 23:41 86. 玄奇 0
11-28 22:13 87. 豪门 0
11-07 22:48 88. 婚恋 0
08-29 23:15 89. 种田 1831
08-21 16:38 90. 豪门 0
07-11 21:25 91. 豪门 0
06-13 15:37 92. 豪门 0
05-07 22:10 93. 豪门 0
02-28 00:01 94. 豪门 304
12-16 07:30 95. 婚恋 669
11-07 18:16 96. 仙侠 54
09-25 19:51 97. 豪门 655
08-31 13:02 98. 现实 374
06-29 09:55 99. 穿越 373
06-19 18:07 100. 婚恋 159
06-04 21:05
";
+
+ /**
+  * Checks the String-based API: rows selected as strings from the ranking
+  * markup can be fed back into another selector.
+  */
+ @Test
+ public void testStringAPI() {
+     // testAPI: selectList(String) -> selectList(Node)
+     List<String> items = new Xpath2Selector("//div[@class=\"bd\"]//tbody/tr").selectList(rank);
+     // Compare by value: assertSame would compare two autoboxed Integer references
+     // and only passes while the value happens to sit inside the Integer cache.
+     Assert.assertEquals(100, items.size());
+     // testAPI: select(String) -> select(Node)
+     String name = new Xpath2Selector("//td[3]/div/a[1]/text()").select(items.get(10));
+     Assert.assertEquals("深宫安容传", name);
+ }
+
+ /**
+  * Checks the Node-based API: rows selected as DOM nodes can be used as the
+  * context for further relative selections.
+  */
+ @Test
+ public void testNodeAPI() {
+     // testAPI: selectNodes(String) -> selectNodes(Node)
+     List<Node> items = new Xpath2Selector("//div[@class=\"bd\"]//tbody/tr").selectNodes(rank);
+     // Compare by value: assertSame would compare two autoboxed Integer references
+     // and only passes while the value happens to sit inside the Integer cache.
+     Assert.assertEquals(100, items.size());
+     // testAPI: selectNode(Node)
+     Node item = new Xpath2Selector("./td[3]/div/a[1]").selectNode(items.get(10));
+     String name = new Xpath2Selector("./text()").select(item);
+     Assert.assertEquals("深宫安容传", name);
+ }
+
+ /**
+  * Checks the utility API: a node selected from the ranking markup is turned
+  * into text by JaxpSelectorUtils.
+  */
+ @Test
+ public void testUtilAPI() throws TransformerException {
+     Xpath2Selector selector = Xpath2Selector.newInstance("//div[@class=\"bd\"]//tbody/tr[11]/td[3]/div/a[1]/text()");
+     Node titleText = selector.selectNode(rank);
+     // testAPI: nodeToString(Node) -> nodesToStrings(List)
+     Assert.assertEquals("深宫安容传", JaxpSelectorUtils.nodeToString(titleText));
}
}
diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml
old mode 100755
new mode 100644
index 9f4219d6c..62cea3e69
--- a/webmagic-scripts/pom.xml
+++ b/webmagic-scripts/pom.xml
@@ -1,18 +1,31 @@
-
+
- webmagic-parent
us.codecraft
- 0.7.3
+ webmagic
+ 1.0.3
4.0.0
webmagic-scripts
- 1.1.2-2
+ 2.1.0
+
+ org.apache.logging.log4j
+ log4j-core
+
+
+ org.apache.logging.log4j
+ log4j-slf4j2-impl
+
org.jruby
jruby
@@ -22,10 +35,6 @@
kotlin-stdlib
${kotlin.version}
-
- org.codehaus.groovy
- groovy-all
-
org.python
jython
@@ -34,25 +43,22 @@
commons-cli
commons-cli
-
- junit
- junit
- test
-
${project.groupId}
webmagic-core
${project.version}
-
- org.slf4j
- slf4j-log4j12
-
${project.groupId}
webmagic-extension
${project.version}
+
+ org.projectlombok
+ lombok
+ 1.18.32
+ provided
+
@@ -94,4 +100,4 @@
-
\ No newline at end of file
+
diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Params.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Params.java
new file mode 100644
index 000000000..873176e6e
--- /dev/null
+++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Params.java
@@ -0,0 +1,47 @@
+package us.codecraft.webmagic.scripts;
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import lombok.Getter;
+import lombok.Setter;
+import us.codecraft.webmagic.scripts.languages.JRuby;
+import us.codecraft.webmagic.scripts.languages.Javascript;
+import us.codecraft.webmagic.scripts.languages.Language;
+import us.codecraft.webmagic.utils.WMCollections;
+
+/**
+ * Holds the settings collected from the script console's command line:
+ * script language, script file, start URLs, thread count and sleep time.
+ */
+public class Params {
+    /**
+     * Command-line aliases accepted for each supported language. Built once when
+     * the class is loaded instead of being re-created by every constructor call,
+     * so creating a Params instance no longer rewrites shared static state.
+     */
+    private static final Map<Language, Set<String>> alias = new HashMap<Language, Set<String>>();
+
+    static {
+        alias.put(new Javascript(), WMCollections.newHashSet("js", "javascript", "JavaScript", "JS"));
+        alias.put(new JRuby(), WMCollections.newHashSet("ruby", "jruby", "Ruby", "JRuby"));
+    }
+
+    /** Language used to run the script; JavaScript unless an alias selects another. */
+    @Getter
+    Language language = new Javascript();
+
+    @Getter @Setter
+    String scriptFileName;
+
+    @Getter @Setter
+    List urls;
+
+    @Getter @Setter
+    int thread = 1;
+
+    @Getter @Setter
+    int sleepTime = 1000;
+
+    public Params() {
+    }
+
+    /**
+     * Selects the language whose alias set contains the given argument.
+     * The current language is kept when no alias matches.
+     *
+     * @param arg language name as typed on the command line
+     */
+    public void setLanguagefromArg(String arg) {
+        for (Map.Entry<Language, Set<String>> languageSetEntry : alias.entrySet()) {
+            if (languageSetEntry.getValue().contains(arg)) {
+                this.language = languageSetEntry.getKey();
+                return;
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java
index 0423e58e1..c60b3ec3d 100755
--- a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java
+++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptConsole.java
@@ -1,88 +1,21 @@
package us.codecraft.webmagic.scripts;
import org.apache.commons.cli.*;
-import org.apache.log4j.Level;
-import org.apache.log4j.Logger;
+
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
+import us.codecraft.webmagic.scripts.config.CommandLineOption;
import us.codecraft.webmagic.utils.WMCollections;
-import java.util.HashMap;
import java.util.List;
-import java.util.Map;
-import java.util.Set;
/**
- * @author code4crafter@gmail.com
+ * @author code4crafter@gmail.com / FrancoisGib
* @since 0.4.1
*/
public class ScriptConsole {
-
- private static class Params {
- Language language = Language.JavaScript;
- String scriptFileName;
- List urls;
- int thread = 1;
- int sleepTime = 1000;
- private static Map> alias = new HashMap>();
-
- static {
- alias.put(Language.JavaScript, WMCollections.newHashSet("js", "javascript", "JavaScript", "JS"));
- alias.put(Language.JRuby, WMCollections.newHashSet("ruby", "jruby", "Ruby", "JRuby"));
- }
-
- public void setLanguagefromArg(String arg) {
- for (Map.Entry> languageSetEntry : alias.entrySet()) {
- if (languageSetEntry.getValue().contains(arg)) {
- this.language = languageSetEntry.getKey();
- return;
- }
- }
- }
-
- private Language getLanguage() {
- return language;
- }
-
- private void setLanguage(Language language) {
- this.language = language;
- }
-
- private String getScriptFileName() {
- return scriptFileName;
- }
-
- private void setScriptFileName(String scriptFileName) {
- this.scriptFileName = scriptFileName;
- }
-
- private List getUrls() {
- return urls;
- }
-
- private void setUrls(List urls) {
- this.urls = urls;
- }
-
- private int getThread() {
- return thread;
- }
-
- private void setThread(int thread) {
- this.thread = thread;
- }
-
- private int getSleepTime() {
- return sleepTime;
- }
-
- private void setSleepTime(int sleepTime) {
- this.sleepTime = sleepTime;
- }
- }
-
public static void main(String[] args) {
Params params = parseCommand(args);
startSpider(params);
@@ -140,45 +73,9 @@ private static void exit() {
private static Params readOptions(CommandLine commandLine) {
Params params = new Params();
- if (commandLine.hasOption("l")) {
- String language = commandLine.getOptionValue("l");
- params.setLanguagefromArg(language);
- }
- if (commandLine.hasOption("f")) {
- String scriptFilename = commandLine.getOptionValue("f");
- params.setScriptFileName(scriptFilename);
- } else {
- exit();
- }
- if (commandLine.hasOption("s")) {
- Integer sleepTime = Integer.parseInt(commandLine.getOptionValue("s"));
- params.setSleepTime(sleepTime);
- }
- if (commandLine.hasOption("t")) {
- Integer thread = Integer.parseInt(commandLine.getOptionValue("t"));
- params.setThread(thread);
- }
- if (commandLine.hasOption("g")) {
- configLogger(commandLine.getOptionValue("g"));
- }
- params.setUrls(commandLine.getArgList());
+ // The positional (non-option) arguments are the start URLs. None of the
+ // CommandLineOption implementations sets them, so they are still copied here.
+ params.setUrls(commandLine.getArgList());
+ // NOTE(review): the removed code called exit() when -f was missing; confirm
+ // that this check now lives in addParamOptionIfInCommandLine or its callers.
+ for (CommandLineOption option : CommandLineOption.getAllOptions()) {
+     option.addParamOptionIfInCommandLine(params, commandLine);
+ }
return params;
}
-
- private static void configLogger(String value) {
- Logger rootLogger = Logger.getRootLogger();
- if ("debug".equalsIgnoreCase(value)) {
- rootLogger.setLevel(Level.DEBUG);
- } else if ("info".equalsIgnoreCase(value)) {
- rootLogger.setLevel(Level.INFO);
- } else if ("warn".equalsIgnoreCase(value)) {
- rootLogger.setLevel(Level.WARN);
- } else if ("trace".equalsIgnoreCase(value)) {
- rootLogger.setLevel(Level.TRACE);
- } else if ("off".equalsIgnoreCase(value)) {
- rootLogger.setLevel(Level.OFF);
- } else if ("error".equalsIgnoreCase(value)) {
- rootLogger.setLevel(Level.ERROR);
- }
- }
-}
+}
\ No newline at end of file
diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptEnginePool.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptEnginePool.java
index d1e5d7fe8..bdfbbaedb 100755
--- a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptEnginePool.java
+++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptEnginePool.java
@@ -2,6 +2,9 @@
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
+
+import us.codecraft.webmagic.scripts.languages.Language;
+
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicInteger;
@@ -11,14 +14,11 @@
*/
public class ScriptEnginePool {
- private final int size;
-
private final AtomicInteger availableCount;
private final LinkedBlockingQueue scriptEngines = new LinkedBlockingQueue();
public ScriptEnginePool(Language language,int size) {
- this.size = size;
this.availableCount = new AtomicInteger(size);
for (int i=0;i getAllOptions() {
+ return List.of(new OptionL(), new OptionF(), new OptionS(), new OptionT(), new OptionG());
+ }
+}
+
+/** Handles the {@code l} option: picks the script language from its alias. */
+class OptionL extends CommandLineOption {
+    public OptionL() {
+        super('l');
+    }
+
+    /** Passes the value given for {@code l} to {@code Params.setLanguagefromArg}. */
+    protected void addParamOption(Params params, CommandLine commandLine) {
+        params.setLanguagefromArg(commandLine.getOptionValue("l"));
+    }
+}
+
+/** Handles the {@code f} option: the script file name. */
+class OptionF extends CommandLineOption {
+    public OptionF() {
+        super('f');
+    }
+
+    /** Stores the value given for {@code f} as the script file name. */
+    protected void addParamOption(Params params, CommandLine commandLine) {
+        params.setScriptFileName(commandLine.getOptionValue("f"));
+    }
+}
+
+/** Handles the {@code s} option: the sleep time. */
+class OptionS extends CommandLineOption {
+    public OptionS() {
+        super('s');
+    }
+
+    /**
+     * Parses the value given for {@code s} as an integer and stores it as the sleep time.
+     * A non-numeric value makes {@code Integer.parseInt} throw NumberFormatException.
+     */
+    protected void addParamOption(Params params, CommandLine commandLine) {
+        String rawSleepTime = commandLine.getOptionValue("s");
+        params.setSleepTime(Integer.parseInt(rawSleepTime));
+    }
+}
+
+/** Handles the {@code t} option: the thread count. */
+class OptionT extends CommandLineOption {
+    public OptionT() {
+        super('t');
+    }
+
+    /**
+     * Parses the value given for {@code t} as an integer and stores it as the thread count.
+     * A non-numeric value makes {@code Integer.parseInt} throw NumberFormatException.
+     */
+    protected void addParamOption(Params params, CommandLine commandLine) {
+        String rawThreadCount = commandLine.getOptionValue("t");
+        params.setThread(Integer.parseInt(rawThreadCount));
+    }
+}
+
+/** Handles the {@code g} option: the logging level. */
+class OptionG extends CommandLineOption {
+    public OptionG() {
+        super('g');
+    }
+
+    /** Hands the value given for {@code g} to {@code ConfigLogger.configLogger}; params is not modified. */
+    protected void addParamOption(Params params, CommandLine commandLine) {
+        String levelName = commandLine.getOptionValue("g");
+        ConfigLogger.configLogger(levelName);
+    }
+}
\ No newline at end of file
diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/config/ConfigLogger.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/config/ConfigLogger.java
new file mode 100644
index 000000000..9e81ea6c7
--- /dev/null
+++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/config/ConfigLogger.java
@@ -0,0 +1,34 @@
+package us.codecraft.webmagic.scripts.config;
+
+import java.util.List;
+
+import org.apache.commons.lang3.tuple.Pair;
+import org.apache.logging.log4j.Level;
+import org.apache.logging.log4j.core.Logger;
+import org.slf4j.LoggerFactory;
+
+public class ConfigLogger {
+    /**
+     * Sets the root logger level from its textual name.
+     * <p>
+     * The name is matched case-insensitively against {@code debug}, {@code info},
+     * {@code warn}, {@code trace}, {@code off} and {@code error}. A {@code null}
+     * or unrecognised value leaves the current level untouched.
+     *
+     * @param value The level name taken from the command line
+     */
+    public static void configLogger(String value) {
+        List<Pair<String, Level>> options = List.of(
+                Pair.of("debug", Level.DEBUG),
+                Pair.of("info", Level.INFO),
+                Pair.of("warn", Level.WARN),
+                Pair.of("trace", Level.TRACE),
+                Pair.of("off", Level.OFF),
+                Pair.of("error", Level.ERROR));
+        // A plain scan replaces the index-based search, whose final bounds check
+        // rejected a match on the last entry, so "error" was never applied.
+        for (Pair<String, Level> option : options) {
+            if (option.getLeft().equalsIgnoreCase(value)) {
+                // The logger handed out by SLF4J's LoggerFactory is the binding's
+                // wrapper, not a log4j-core Logger, so casting it fails under the
+                // log4j-slf4j2-impl binding. Configurator is log4j-core's own API
+                // for changing the root level.
+                org.apache.logging.log4j.core.config.Configurator.setRootLevel(option.getRight());
+                return;
+            }
+        }
+    }
+}
diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/JRuby.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/JRuby.java
new file mode 100644
index 000000000..b3a3209a5
--- /dev/null
+++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/JRuby.java
@@ -0,0 +1,26 @@
+package us.codecraft.webmagic.scripts.languages;
+
+import java.util.Iterator;
+import java.util.Map;
+
+import javax.script.ScriptEngine;
+import javax.script.ScriptException;
+
+import org.jruby.RubyHash;
+
+import us.codecraft.webmagic.Page;
+
+/** Language binding for scripts run on the "jruby" script engine. */
+public class JRuby extends Language {
+    public JRuby() {
+        super("jruby","ruby/defines.rb","");
+    }
+
+    /**
+     * Evaluates the defines followed by the script and copies every entry of the
+     * hash returned by the evaluation into the page's result items.
+     *
+     * @param engine  script engine used for evaluation
+     * @param defines source prepended to the script
+     * @param script  user script
+     * @param page    page whose result items receive the entries
+     * @throws ScriptException if evaluation fails
+     */
+    public void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException {
+        String source = defines + "\n" + script;
+        RubyHash results = (RubyHash) engine.eval(source, engine.getContext());
+        for (Object element : results.entrySet()) {
+            Map.Entry entry = (Map.Entry) element;
+            page.getResultItems().put(entry.getKey().toString(), entry.getValue());
+        }
+    }
+}
\ No newline at end of file
diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Javascript.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Javascript.java
new file mode 100644
index 000000000..b0f7b647a
--- /dev/null
+++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Javascript.java
@@ -0,0 +1,16 @@
+package us.codecraft.webmagic.scripts.languages;
+
+import javax.script.ScriptEngine;
+import javax.script.ScriptException;
+
+import us.codecraft.webmagic.Page;
+
+/** Language binding for scripts run on the "javascript" script engine. */
+public class Javascript extends Language {
+    public Javascript() {
+        super("javascript","js/defines.js","");
+    }
+
+    /**
+     * Evaluates the defines followed by the script in the engine's context.
+     * The evaluation result is discarded and the page is not touched here.
+     *
+     * @throws ScriptException if evaluation fails
+     */
+    public void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException {
+        String source = defines + "\n" + script;
+        engine.eval(source, engine.getContext());
+    }
+}
\ No newline at end of file
diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Jython.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Jython.java
new file mode 100644
index 000000000..9124d2dbb
--- /dev/null
+++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Jython.java
@@ -0,0 +1,27 @@
+package us.codecraft.webmagic.scripts.languages;
+
+import java.util.Iterator;
+import java.util.Map;
+
+import javax.script.ScriptEngine;
+import javax.script.ScriptException;
+
+import org.python.core.PyDictionary;
+
+import us.codecraft.webmagic.Page;
+
+/** Language binding for scripts run on the "jython" script engine. */
+public class Jython extends Language {
+    public Jython() {
+        super("jython","python/defines.py","");
+    }
+
+    /**
+     * Evaluates the defines followed by the script, then copies every entry of
+     * the engine variable named "result" into the page's result items.
+     *
+     * @param engine  script engine used for evaluation
+     * @param defines source prepended to the script
+     * @param script  user script
+     * @param page    page whose result items receive the entries
+     * @throws ScriptException if evaluation fails
+     */
+    public void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException {
+        String source = defines + "\n" + script;
+        engine.eval(source, engine.getContext());
+        PyDictionary results = (PyDictionary) engine.get("result");
+        for (Object element : results.entrySet()) {
+            Map.Entry entry = (Map.Entry) element;
+            page.getResultItems().put(entry.getKey().toString(), entry.getValue());
+        }
+    }
+}
\ No newline at end of file
diff --git a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Language.java b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Language.java
old mode 100755
new mode 100644
similarity index 51%
rename from webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Language.java
rename to webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Language.java
index 2f9d22d57..44e6ba0a0
--- a/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Language.java
+++ b/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/languages/Language.java
@@ -1,15 +1,18 @@
-package us.codecraft.webmagic.scripts;
+package us.codecraft.webmagic.scripts.languages;
+
+import javax.script.ScriptEngine;
+import javax.script.ScriptException;
+import us.codecraft.webmagic.Page;
/**
- * @author code4crafter@gmail.com
+ * @author FrancoisGib
*/
-public enum Language {
-
- JavaScript("javascript","js/defines.js",""),
-
- JRuby("jruby","ruby/defines.rb",""),
-
- Jython("jython","python/defines.py","");
+public abstract class Language {
+ public Language(String engineName, String defineFile, String gatherFile) {
+ this.engineName = engineName;
+ this.defineFile = defineFile;
+ this.gatherFile = gatherFile;
+ }
private String engineName;
@@ -17,12 +20,6 @@ public enum Language {
private String gatherFile;
- Language(String engineName, String defineFile, String gatherFile) {
- this.engineName = engineName;
- this.defineFile = defineFile;
- this.gatherFile = gatherFile;
- }
-
public String getEngineName() {
return engineName;
}
@@ -34,4 +31,6 @@ public String getDefineFile() {
public String getGatherFile() {
return gatherFile;
}
+
+ public abstract void process(ScriptEngine engine, String defines, String script, Page page) throws ScriptException;
}
diff --git a/webmagic-scripts/src/main/resources/log4j.xml b/webmagic-scripts/src/main/resources/log4j.xml
deleted file mode 100755
index 474269cb1..000000000
--- a/webmagic-scripts/src/main/resources/log4j.xml
+++ /dev/null
@@ -1,21 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java b/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java
index ffeb9c993..b4c28521f 100755
--- a/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java
+++ b/webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java
@@ -2,7 +2,11 @@
import org.junit.Ignore;
import org.junit.Test;
+
import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.scripts.languages.JRuby;
+import us.codecraft.webmagic.scripts.languages.Javascript;
+import us.codecraft.webmagic.scripts.languages.Jython;
/**
* @author code4crafter@gmail.com
@@ -13,14 +17,14 @@ public class ScriptProcessorTest {
@Test
public void testJavaScriptProcessor() {
- ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(Language.JavaScript).scriptFromClassPathFile("js/oschina.js").build();
+ ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(new Javascript()).scriptFromClassPathFile("js/oschina.js").build();
pageProcessor.getSite().setSleepTime(0);
Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run();
}
@Test
public void testRubyProcessor() {
- ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(Language.JRuby).scriptFromClassPathFile("ruby/oschina.rb").build();
+ ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(new JRuby()).scriptFromClassPathFile("ruby/oschina.rb").build();
pageProcessor.getSite().setSleepTime(0);
Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run();
}
@@ -28,7 +32,7 @@ public void testRubyProcessor() {
@Test
public void testPythonProcessor() {
- ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(Language.Jython).scriptFromClassPathFile("python/oschina.py").build();
+ ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(new Jython()).scriptFromClassPathFile("python/oschina.py").build();
pageProcessor.getSite().setSleepTime(0);
Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run();
}
diff --git a/webmagic-scripts/src/test/resouces/log4j.xml b/webmagic-scripts/src/test/resouces/log4j.xml
deleted file mode 100755
index 1f64d8dad..000000000
--- a/webmagic-scripts/src/test/resouces/log4j.xml
+++ /dev/null
@@ -1,21 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/webmagic-scripts/src/test/resources/log4j2-test.xml b/webmagic-scripts/src/test/resources/log4j2-test.xml
new file mode 100644
index 000000000..e2fab6602
--- /dev/null
+++ b/webmagic-scripts/src/test/resources/log4j2-test.xml
@@ -0,0 +1,16 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml
index dfc4a1958..16214c61a 100644
--- a/webmagic-selenium/pom.xml
+++ b/webmagic-selenium/pom.xml
@@ -1,45 +1,46 @@
-
-
- webmagic-parent
- us.codecraft
- 0.7.3
-
- 4.0.0
+
+
+ us.codecraft
+ webmagic
+ 1.0.3
+
+ 4.0.0
- webmagic-selenium
+ webmagic-selenium
-
-
- org.seleniumhq.selenium
- selenium-java
-
-
- ${project.groupId}
- webmagic-core
- ${project.version}
-
-
- com.github.detro
- phantomjsdriver
-
-
- junit
- junit
-
-
+
+
+ org.seleniumhq.selenium
+ selenium-java
+
+
+ ${project.groupId}
+ webmagic-core
+ ${project.version}
+
+
+ com.github.detro
+ phantomjsdriver
+
+
-
-
-
- org.apache.maven.plugins
- maven-deploy-plugin
- 3.0.0-M1
-
- true
-
-
-
-
+
+
+
+ org.apache.maven.plugins
+ maven-deploy-plugin
+ 3.0.0-M1
+
+ true
+
+
+
+
diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java
index cce293fc9..f6d2574fb 100644
--- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java
+++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java
@@ -11,12 +11,14 @@
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
-import us.codecraft.webmagic.downloader.Downloader;
+import us.codecraft.webmagic.downloader.AbstractDownloader;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText;
+import us.codecraft.webmagic.utils.HttpConstant;
import java.io.Closeable;
import java.io.IOException;
+import java.net.http.HttpRequest;
import java.util.Map;
/**
@@ -24,112 +26,121 @@
* 需要下载Selenium driver支持。
*
* @author code4crafter@gmail.com
- * Date: 13-7-26
- * Time: 下午1:37
+ * Date: 13-7-26
+ * Time: 下午1:37
*/
-public class SeleniumDownloader implements Downloader, Closeable {
-
- private volatile WebDriverPool webDriverPool;
-
- private Logger logger = LoggerFactory.getLogger(getClass());
-
- private int sleepTime = 0;
-
- private int poolSize = 1;
-
- private static final String DRIVER_PHANTOMJS = "phantomjs";
-
- /**
- * 新建
- *
- * @param chromeDriverPath chromeDriverPath
- */
- public SeleniumDownloader(String chromeDriverPath) {
- System.getProperties().setProperty("webdriver.chrome.driver",
- chromeDriverPath);
- }
-
- /**
- * Constructor without any filed. Construct PhantomJS browser
- *
- * @author bob.li.0718@gmail.com
- */
- public SeleniumDownloader() {
- // System.setProperty("phantomjs.binary.path",
- // "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs");
- }
-
- /**
- * set sleep time to wait until load success
- *
- * @param sleepTime sleepTime
- * @return this
- */
- public SeleniumDownloader setSleepTime(int sleepTime) {
- this.sleepTime = sleepTime;
- return this;
- }
-
- @Override
- public Page download(Request request, Task task) {
- checkInit();
- WebDriver webDriver;
- try {
- webDriver = webDriverPool.get();
- } catch (InterruptedException e) {
- logger.warn("interrupted", e);
- return null;
- }
- logger.info("downloading page " + request.getUrl());
- webDriver.get(request.getUrl());
- try {
- Thread.sleep(sleepTime);
- } catch (InterruptedException e) {
- e.printStackTrace();
- }
- WebDriver.Options manage = webDriver.manage();
- Site site = task.getSite();
- if (site.getCookies() != null) {
- for (Map.Entry cookieEntry : site.getCookies()
- .entrySet()) {
- Cookie cookie = new Cookie(cookieEntry.getKey(),
- cookieEntry.getValue());
- manage.addCookie(cookie);
- }
- }
-
- /*
- * TODO You can add mouse event or other processes
- *
- * @author: bob.li.0718@gmail.com
- */
-
- WebElement webElement = webDriver.findElement(By.xpath("/html"));
- String content = webElement.getAttribute("outerHTML");
- Page page = new Page();
- page.setRawText(content);
- page.setHtml(new Html(content, request.getUrl()));
- page.setUrl(new PlainText(request.getUrl()));
- page.setRequest(request);
- webDriverPool.returnToPool(webDriver);
- return page;
- }
-
- private void checkInit() {
- if (webDriverPool == null) {
- synchronized (this) {
- webDriverPool = new WebDriverPool(poolSize);
- }
- }
- }
-
- @Override
- public void setThread(int thread) {
- this.poolSize = thread;
- }
-
- @Override
- public void close() throws IOException {
- webDriverPool.closeAll();
- }
+public class SeleniumDownloader extends AbstractDownloader implements Closeable {
+
+    /** Created on first download; volatile so the unlocked null check in checkInit sees the published pool. */
+    private volatile WebDriverPool webDriverPool;
+
+    private Logger logger = LoggerFactory.getLogger(getClass());
+
+    /** Milliseconds to wait after loading a page before its content is read. */
+    private int sleepTime = 0;
+
+    /** Size handed to the WebDriverPool when it is created. */
+    private int poolSize = 1;
+
+    private static final String DRIVER_PHANTOMJS = "phantomjs";
+
+    /**
+     * Creates a downloader and sets the {@code webdriver.chrome.driver}
+     * system property to the given path.
+     *
+     * @param chromeDriverPath chromeDriverPath
+     */
+    public SeleniumDownloader(String chromeDriverPath) {
+        System.getProperties().setProperty("webdriver.chrome.driver",
+                chromeDriverPath);
+    }
+
+    /**
+     * Constructor without any field. Construct PhantomJS browser
+     *
+     * @author bob.li.0718@gmail.com
+     */
+    public SeleniumDownloader() {
+        // System.setProperty("phantomjs.binary.path",
+        // "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs");
+    }
+
+    /**
+     * set sleep time to wait until load success
+     *
+     * @param sleepTime sleepTime
+     * @return this
+     */
+    public SeleniumDownloader setSleepTime(int sleepTime) {
+        this.sleepTime = sleepTime;
+        return this;
+    }
+
+    /**
+     * Loads the request URL in a pooled WebDriver and returns the rendered
+     * {@code outerHTML} of the root element as a page.
+     *
+     * @param request request whose URL is loaded
+     * @param task    task supplying the site (and its cookies)
+     * @return the downloaded page, or the failed page from {@code Page.fail} when anything throws
+     */
+    @Override
+    public Page download(Request request, Task task) {
+        checkInit();
+        WebDriver webDriver = null;
+        Page page = Page.fail(request);
+        try {
+            webDriver = webDriverPool.get();
+
+            logger.info("downloading page " + request.getUrl());
+            webDriver.get(request.getUrl());
+            try {
+                if (sleepTime > 0) {
+                    Thread.sleep(sleepTime);
+                }
+            } catch (InterruptedException e) {
+                // Keep the interrupt visible to the caller instead of swallowing it.
+                logger.warn("interrupted while waiting for page {}", request.getUrl(), e);
+                Thread.currentThread().interrupt();
+            }
+            WebDriver.Options manage = webDriver.manage();
+            Site site = task.getSite();
+            if (site.getCookies() != null) {
+                for (Map.Entry<String, String> cookieEntry : site.getCookies()
+                        .entrySet()) {
+                    Cookie cookie = new Cookie(cookieEntry.getKey(),
+                            cookieEntry.getValue());
+                    manage.addCookie(cookie);
+                }
+            }
+
+            /*
+             * TODO You can add mouse event or other processes
+             *
+             * @author: bob.li.0718@gmail.com
+             */
+
+            WebElement webElement = webDriver.findElement(By.xpath("/html"));
+            String content = webElement.getAttribute("outerHTML");
+            page.setDownloadSuccess(true);
+            page.setRawText(content);
+            page.setHtml(new Html(content, request.getUrl()));
+            page.setUrl(new PlainText(request.getUrl()));
+            page.setRequest(request);
+            page.setStatusCode(HttpConstant.StatusCode.CODE_200);
+            onSuccess(page, task);
+        } catch (Exception e) {
+            if (e instanceof InterruptedException) {
+                // Interrupted while waiting for a driver: restore the flag for the caller.
+                Thread.currentThread().interrupt();
+            }
+            logger.warn("download page {} error", request.getUrl(), e);
+            onError(page, task, e);
+        } finally {
+            if (webDriver != null) {
+                webDriverPool.returnToPool(webDriver);
+            }
+        }
+        return page;
+    }
+
+    /**
+     * Creates the driver pool on first use. The null check is repeated inside
+     * the lock so that two threads racing here cannot each create a pool.
+     */
+    private void checkInit() {
+        if (webDriverPool == null) {
+            synchronized (this) {
+                if (webDriverPool == null) {
+                    webDriverPool = new WebDriverPool(poolSize);
+                }
+            }
+        }
+    }
+
+    @Override
+    public void setThread(int thread) {
+        this.poolSize = thread;
+    }
+
+    /**
+     * Closes every driver in the pool. Does nothing when no download has
+     * happened yet and the pool was therefore never created.
+     */
+    @Override
+    public void close() throws IOException {
+        WebDriverPool pool = webDriverPool;
+        if (pool != null) {
+            pool.closeAll();
+        }
+    }
}
diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java
index e1d9dd039..b96d2894b 100644
--- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java
+++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java
@@ -1,15 +1,5 @@
package us.codecraft.webmagic.downloader.selenium;
-import org.openqa.selenium.WebDriver;
-import org.openqa.selenium.chrome.ChromeDriver;
-import org.openqa.selenium.firefox.FirefoxDriver;
-import org.openqa.selenium.phantomjs.PhantomJSDriver;
-import org.openqa.selenium.phantomjs.PhantomJSDriverService;
-import org.openqa.selenium.remote.DesiredCapabilities;
-import org.openqa.selenium.remote.RemoteWebDriver;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
import java.io.FileReader;
import java.io.IOException;
import java.net.MalformedURLException;
@@ -22,6 +12,18 @@
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.atomic.AtomicInteger;
+import org.openqa.selenium.WebDriver;
+import org.openqa.selenium.chrome.ChromeDriver;
+import org.openqa.selenium.chrome.ChromeOptions;
+import org.openqa.selenium.firefox.FirefoxDriver;
+import org.openqa.selenium.firefox.FirefoxOptions;
+import org.openqa.selenium.phantomjs.PhantomJSDriver;
+import org.openqa.selenium.phantomjs.PhantomJSDriverService;
+import org.openqa.selenium.remote.DesiredCapabilities;
+import org.openqa.selenium.remote.RemoteWebDriver;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
/**
* @author code4crafter@gmail.com
* Date: 13-7-26
@@ -58,7 +60,7 @@ class WebDriverPool {
* Configure the GhostDriver, and initialize a WebDriver instance. This part
* of code comes from GhostDriver.
* https://github.com/detro/ghostdriver/tree/master/test/java/src/test/java/ghostdriver
- *
+ *
* @author bob.li.0718@gmail.com
* @throws IOException
*/
@@ -73,7 +75,6 @@ public void configure() throws IOException {
// Prepare capabilities
sCaps = new DesiredCapabilities();
- sCaps.setJavascriptEnabled(true);
sCaps.setCapability("takesScreenshot", false);
String driver = sConfig.getProperty("driver", DRIVER_PHANTOMJS);
@@ -134,9 +135,9 @@ public void configure() throws IOException {
sCaps.setBrowserName("phantomjs");
mDriver = new RemoteWebDriver(new URL(driver), sCaps);
} else if (driver.equals(DRIVER_FIREFOX)) {
- mDriver = new FirefoxDriver(sCaps);
+ mDriver = new FirefoxDriver(new FirefoxOptions(sCaps));
} else if (driver.equals(DRIVER_CHROME)) {
- mDriver = new ChromeDriver(sCaps);
+ mDriver = new ChromeDriver(new ChromeOptions().merge(sCaps));
} else if (driver.equals(DRIVER_PHANTOMJS)) {
mDriver = new PhantomJSDriver(sCaps);
}
@@ -144,7 +145,7 @@ public void configure() throws IOException {
/**
* check whether input is a valid URL
- *
+ *
* @author bob.li.0718@gmail.com
* @param urlString urlString
* @return true means yes, otherwise no.
@@ -178,7 +179,7 @@ public WebDriverPool() {
}
/**
- *
+ *
* @return
* @throws InterruptedException
*/
diff --git a/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java b/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java
index b7bcd80b3..43ac84b5a 100644
--- a/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java
+++ b/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java
@@ -1,17 +1,18 @@
package us.codecraft.webmagic.downloader;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+
import org.junit.Ignore;
import org.junit.Test;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
+import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.remote.DesiredCapabilities;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.Map;
-
/**
* @author code4crafter@gmail.com
* Date: 13-7-26
@@ -29,10 +30,10 @@ public void testSelenium() {
Map preferences = new HashMap();
preferences.put("profile.default_content_settings", contentSettings);
- DesiredCapabilities caps = DesiredCapabilities.chrome();
+ DesiredCapabilities caps = new DesiredCapabilities();
caps.setCapability("chrome.prefs", preferences);
caps.setCapability("chrome.switches", Arrays.asList("--user-data-dir=/Users/yihua/temp/chrome"));
- WebDriver webDriver = new ChromeDriver(caps);
+ WebDriver webDriver = new ChromeDriver(new ChromeOptions().merge(caps));
webDriver.get("http://huaban.com/");
WebElement webElement = webDriver.findElement(By.xpath("/html"));
System.out.println(webElement.getAttribute("outerHTML"));