result = new ArrayList<>();
+ int i = 0;
+ int j = 0;
+ while (i < a.length && j < b.length) {
+ int cmp = a[i].compareTo(b[j]);
+ if (cmp < 0) {
+ result.add(a[i]);
+ i++;
+ } else if (cmp > 0) {
+ j++;
+ } else {
+ i++;
+ j++;
+ }
+ }
+ while (i < a.length) {
+ result.add(a[i]);
+ i++;
+ }
+ return result;
+
+ }
+
+ /**
+ * Attempts to infer the method and requestBody extra fields by comparing an old urlkey against the original url.
+ *
+ * When run with the --post-append option, webrecorder/cdxj-indexer will include the request method and encoded
+ * version of the request body as query parameters in the url key. In CDXJ output mode it will usually also
+ * add extra "method" and "requestBody" fields for these values. However, in CDX11 output mode there are no extra
+ * fields. Older versions of cdxj-indexer in CDXJ mode also didn't populate the extra fields.
+ *
+ * We can identify the parameters that came from the request because they won't appear in the original url's
+ * query string.
+ *
+ * This method does nothing if the old urlkey is null, if it contains no __wb_method parameter, or if either
+ * of the extra fields is already populated.
+ *
+ * @param oldUrlKey the previously recorded urlkey to inspect; may be null
+ * @param canonicalizer canonicalizes the original url so its query parameters can be compared with the old urlkey's
+ */
+ private void inferMethodAndRequestBodyFromOldUrlKey(String oldUrlKey, UrlCanonicalizer canonicalizer) {
+ if (oldUrlKey == null) return;
+ if (!oldUrlKey.contains("__wb_method=")) return;
+ if (extra != null) {
+ if (extra.containsKey("method")) return;
+ if (extra.containsKey("requestBody")) return;
+ }
+
+ // if the old urlkey contains __wb_method but we don't have method and requestBody,
+ // then we try to extract them from the urlkey by looking for query parameters that appear
+ // in the old urlkey but aren't present in the original url field.
+ String[] oldParams = extractQueryParams(oldUrlKey);
+ String newUrlKey = canonicalizer.surtCanonicalize(original);
+ String[] newParams = extractQueryParams(newUrlKey);
+ // typed as List<String>: iterating a raw List as String below would not compile
+ List<String> extraParams = diffParams(oldParams, newParams);
+ StringBuilder builder = new StringBuilder();
+ for (String param: extraParams) {
+ if (param.startsWith("__wb_method=")) {
+ // Locale.ROOT keeps the upper-casing independent of the default locale
+ put("method", param.substring("__wb_method=".length()).toUpperCase(Locale.ROOT));
+ } else {
+ // probably a request body parameter; re-join them with '&' in the order diffParams returned them
+ if (builder.length() > 0) {
+ builder.append("&");
+ }
+ builder.append(param);
+ }
+ }
+ // only record a requestBody when at least one non-method parameter was found
+ if (builder.length() > 0) {
+ put("requestBody", builder.toString());
+ }
+ }
+
+ /**
+ * Builds the urlkey for this capture by canonicalizing the original url.
+ *
+ * When the "method" extra field is POST or PUT (compared case-insensitively), a __wb_method parameter and,
+ * if the "requestBody" extra field is non-empty, the request body are appended to the url's query string
+ * before canonicalization. For any other method the original url is canonicalized unchanged.
+ *
+ * @param canonicalizer the canonicalizer used to produce the urlkey
+ * @return the canonicalized urlkey
+ */
+ private String generateUrlKey(UrlCanonicalizer canonicalizer) {
+ String method = getExtraString("method");
+ String requestBody = getExtraString("requestBody");
+ boolean appendRequest = "POST".equalsIgnoreCase(method) || "PUT".equalsIgnoreCase(method);
+ if (!appendRequest) {
+ return canonicalizer.surtCanonicalize(original);
+ }
+
+ // extend the existing query string if there is one, otherwise start a new one
+ char separator = original.contains("?") ? '&' : '?';
+ StringBuilder augmented = new StringBuilder(original)
+ .append(separator)
+ .append("__wb_method=")
+ .append(method);
+ boolean hasBody = requestBody != null && !requestBody.isEmpty();
+ if (hasBody) {
+ augmented.append('&').append(requestBody);
+ }
+ return canonicalizer.surtCanonicalize(augmented.toString());
+ }
+
/**
* Convert a 14 digit CDX timestamp into a 64 bit integer (long). If the supplied string is too short, 0 will be
* appended to pad it out. If the supplied string is to long, an exception will be thrown.
diff --git a/test/outbackcdx/CaptureBenchmark.java b/test/outbackcdx/CaptureBenchmark.java
new file mode 100644
index 0000000..f3874a4
--- /dev/null
+++ b/test/outbackcdx/CaptureBenchmark.java
@@ -0,0 +1,78 @@
+package outbackcdx;
+
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.infra.Blackhole;
+
+import java.io.IOException;
+ /** JMH micro-benchmarks for parsing, encoding and decoding {@link Capture} records. */
+ public class CaptureBenchmark {
+ @State(Scope.Benchmark) // Scope.Benchmark: one state instance is shared by all benchmark threads (per JMH docs)
+ public static class MyState {
+ UrlCanonicalizer canonicalizer = new UrlCanonicalizer();
+ Capture capture = Capture.fromCdxLine("- 19870102030405 http://example.org/ text/html 200 sha1:M5ORM4XQ5QCEZEDRNZRGSWXPCOGUVASI - 100 test.warc.gz", canonicalizer); // fixture parsed from a plain CDX line
+ Capture capture2 = Capture.fromCdxLine("- 20210203115119 {\"url\": \"https://example.org/robots.txt\", " +
+ "\"mime\": \"unk\", \"status\": \"400\", \"digest\": \"3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ\", " +
+ "\"length\": \"451\", \"offset\": \"90493\", \"filename\": \"example.warc.gz\", \"method\": \"POST\", \"requestBody\": \"x=1&y=2\"}",
+ canonicalizer); // fixture parsed from a CDXJ line that carries "method" and "requestBody" fields
+ byte[] keyV3 = capture2.encodeKey(3); // capture2 pre-encoded with arguments 3 and 5 (presumably format versions - TODO confirm); read by decodeV3/decodeV5
+ byte[] keyV5 = capture2.encodeKey(5);
+ byte[] valueV3 = capture2.encodeValue(3);
+ byte[] valueV5 = capture2.encodeValue(5);
+ }
+ // Parses a CDX line whose urlkey has no query parameters.
+ @Benchmark
+ public Capture parseCdx(MyState state) {
+ return Capture.fromCdxLine("org,example)/ 19870102030405 http://example.org/ text/html 200 sha1:M5ORM4XQ5QCEZEDRNZRGSWXPCOGUVASI - 100 test.warc.gz", state.canonicalizer);
+ }
+ // Parses a CDX line whose urlkey has __wb_method and __wb_post_data parameters that the url field lacks.
+ @Benchmark
+ public Capture parseCdxInfer(MyState state) {
+ return Capture.fromCdxLine("org,example)/?__wb_method=post&__wb_post_data=dGVzdAo= 19870102030405 http://example.org/ text/html 200 sha1:M5ORM4XQ5QCEZEDRNZRGSWXPCOGUVASI - 100 test.warc.gz", state.canonicalizer);
+ }
+ // Parses a CDXJ line whose JSON block includes "method" and "requestBody" fields.
+ @Benchmark
+ public Capture parseCdxj(MyState state) {
+ return Capture.fromCdxLine("org,example)/ 20210203115119 {\"url\": \"https://example.org/\", " +
+ "\"mime\": \"unk\", \"status\": \"400\", \"digest\": \"3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ\", " +
+ "\"length\": \"451\", \"offset\": \"90493\", \"filename\": \"example.warc.gz\", " +
+ "\"non-standard-field\": [\"yes\", 2, 3], \"method\": \"POST\", \"requestBody\": \"x=1&y=2\"}",
+ state.canonicalizer);
+ }
+ // Parses a CDXJ line whose urlkey has __wb_method and __wb_post_data parameters but whose JSON block has no "method" or "requestBody" field.
+ @Benchmark
+ public Capture parseCdxjInfer(MyState state) {
+ return Capture.fromCdxLine("org,example)/?__wb_method=post&__wb_post_data=dGVzdAo= 20210203115119 {\"url\": \"https://example.org/\", " +
+ "\"mime\": \"unk\", \"status\": \"400\", \"digest\": \"3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ\", " +
+ "\"length\": \"451\", \"offset\": \"90493\", \"filename\": \"example.warc.gz\", " +
+ "\"non-standard-field\": [\"yes\", 2, 3]}",
+ state.canonicalizer);
+ }
+ // Encodes the key and value of state.capture (the CDX fixture) with argument 3; results go to the Blackhole.
+ @Benchmark
+ public void encodeV3(MyState state, Blackhole blackhole) {
+ blackhole.consume(state.capture.encodeKey(3));
+ blackhole.consume(state.capture.encodeValue(3));
+ }
+ // Encodes the key and value of state.capture (the CDX fixture) with argument 5; results go to the Blackhole.
+ @Benchmark
+ public void encodeV5(MyState state, Blackhole blackhole) {
+ blackhole.consume(state.capture.encodeKey(5));
+ blackhole.consume(state.capture.encodeValue(5));
+ }
+ // Constructs a Capture from the key/value bytes that MyState pre-encoded from capture2 with argument 3.
+ @Benchmark
+ public void decodeV3(MyState state, Blackhole blackhole) {
+ blackhole.consume(new Capture(state.keyV3, state.valueV3));
+ }
+ // Constructs a Capture from the key/value bytes that MyState pre-encoded from capture2 with argument 5.
+ @Benchmark
+ public void decodeV5(MyState state, Blackhole blackhole) {
+ blackhole.consume(new Capture(state.keyV5, state.valueV5));
+ }
+ // Delegates to the JMH command-line entry point, passing the arguments through unchanged.
+ public static void main(String[] args) throws IOException {
+ org.openjdk.jmh.Main.main(args);
+ }
+ }
diff --git a/test/outbackcdx/CaptureTest.java b/test/outbackcdx/CaptureTest.java
index 74430b5..926b715 100644
--- a/test/outbackcdx/CaptureTest.java
+++ b/test/outbackcdx/CaptureTest.java
@@ -3,6 +3,7 @@
import org.junit.Test;
import java.util.Arrays;
+
import static org.junit.Assert.assertEquals;
public class CaptureTest {
@@ -51,12 +52,16 @@ public void testCdxDigestScheme() {
@Test
public void testCdxj() {
- Capture src = Capture.fromCdxLine("- 20210203115119 {\"url\": \"https://example.org/robots.txt\", \"mime\": \"unk\", \"status\": \"400\", \"digest\": \"3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ\", \"length\": \"451\", \"offset\": \"90493\", \"filename\": \"example.warc.gz\", \"non-standard-field\": [\"yes\", 2, 3]}", new UrlCanonicalizer());
+ Capture src = Capture.fromCdxLine("- 20210203115119 {\"url\": \"https://example.org/robots.txt\", " +
+ "\"mime\": \"unk\", \"status\": \"400\", \"digest\": \"3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ\", " +
+ "\"length\": \"451\", \"offset\": \"90493\", \"filename\": \"example.warc.gz\", " +
+ "\"non-standard-field\": [\"yes\", 2, 3], \"method\": \"POST\", \"requestBody\": \"x=1&y=2\"}",
+ new UrlCanonicalizer());
Capture dst = new Capture(src.encodeKey(5), src.encodeValue(5));
assertEquals(451, src.length);
assertEquals(90493, src.compressedoffset);
assertFieldsEqual(src, dst);
- assertEquals("org,example)/robots.txt 20210203115119 https://example.org/robots.txt unk 400 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 451 90493 example.warc.gz - - -", dst.toString());
+ assertEquals("org,example)/robots.txt?__wb_method=post&x=1&y=2 20210203115119 https://example.org/robots.txt unk 400 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 451 90493 example.warc.gz - - -", dst.toString());
assertEquals(Arrays.asList("yes", 2, 3), dst.get("non-standard-field"));
}
@@ -74,12 +79,14 @@ public void testWbPostDataExtraction() {
UrlCanonicalizer canonicalizer = new UrlCanonicalizer();
// simple url with no parameters
- Capture cap = Capture.fromCdxLine("com,test)/append?__wb_post_data=dGVzdAo= 20200528143535 https://test.com/append application/json 202 2WC5VZGPEJIVA6BQPKMISFH7ISBVWYUQ - - 467 4846509 test.warc.gz", canonicalizer);
- assertEquals("com,test)/append?__wb_post_data=dGVzdAo=", cap.urlkey);
+ Capture cap = Capture.fromCdxLine("com,test)/append?__wb_method=post&__wb_post_data=dGVzdAo= 20200528143535 https://test.com/append application/json 202 2WC5VZGPEJIVA6BQPKMISFH7ISBVWYUQ - - 467 4846509 test.warc.gz", canonicalizer);
+ assertEquals("com,test)/append?__wb_method=post&__wb_post_data=dgvzdao=", cap.urlkey);
+ assertEquals("POST", cap.get("method"));
+ assertEquals("__wb_post_data=dGVzdAo=", cap.get("requestBody"));
// url with parameters
- cap = Capture.fromCdxLine("com,test)/append?x=1&__wb_post_data=dGVzdAo= 20200528143535 https://test.com/append?x=1 application/json 202 2WC5VZGPEJIVA6BQPKMISFH7ISBVWYUQ - - 467 4846509 test.warc.gz", canonicalizer);
- assertEquals("com,test)/append?x=1&__wb_post_data=dGVzdAo=", cap.urlkey);
+ cap = Capture.fromCdxLine("com,test)/append?x=1&__wb_method=post&__wb_post_data=dGVzdAo= 20200528143535 https://test.com/append?x=1 application/json 202 2WC5VZGPEJIVA6BQPKMISFH7ISBVWYUQ - - 467 4846509 test.warc.gz", canonicalizer);
+ assertEquals("com,test)/append?__wb_method=post&__wb_post_data=dgvzdao=&x=1", cap.urlkey);
}
@Test
diff --git a/test/outbackcdx/IndexTest.java b/test/outbackcdx/IndexTest.java
index 07a65cb..1f94e35 100644
--- a/test/outbackcdx/IndexTest.java
+++ b/test/outbackcdx/IndexTest.java
@@ -62,14 +62,14 @@ public void testClosest() throws IOException {
@Test
public void testPostData() throws IOException {
try (Index.Batch batch = index.beginUpdate()) {
- batch.putCapture(Capture.fromCdxLine("org,post)/?__wb_post_data=dGVzdAo= 20200528143307 http://post.org/ text/html 200 - - 0 w1", index.canonicalizer));
- batch.putCapture(Capture.fromCdxLine("org,post)/?__wb_post_data=dGVzdDIK 20200528143307 http://post.org/ text/html 200 - - 0 w1", index.canonicalizer));
+ batch.putCapture(Capture.fromCdxLine("org,post)/?__wb_method=post&__wb_post_data=dGVzdAo= 20200528143307 http://post.org/ text/html 200 - - 0 w1", index.canonicalizer));
+ batch.putCapture(Capture.fromCdxLine("org,post)/?__wb_method=post&__wb_post_data=dGVzdDIK 20200528143307 http://post.org/ text/html 200 - - 0 w1", index.canonicalizer));
batch.commit();
}
List results = new ArrayList<>();
- index.closestQuery("org,post)/?__wb_post_data=dGVzdAo=", 20200528143307L, null).forEach(results::add);
- assertEquals("org,post)/?__wb_post_data=dGVzdAo=", results.get(0).urlkey);
+ index.closestQuery("org,post)/?__wb_method=post&__wb_post_data=dgvzdao=", 20200528143307L, null).forEach(results::add);
+ assertEquals("org,post)/?__wb_method=post&__wb_post_data=dgvzdao=", results.get(0).urlkey);
}
@Test