diff --git a/pom.xml b/pom.xml index e5fd865..9c8518a 100644 --- a/pom.xml +++ b/pom.xml @@ -166,5 +166,17 @@ 2.6 test + + org.openjdk.jmh + jmh-core + 1.36 + test + + + org.openjdk.jmh + jmh-generator-annprocess + 1.36 + test + diff --git a/src/outbackcdx/Capture.java b/src/outbackcdx/Capture.java index 2e9287d..e7af0bd 100644 --- a/src/outbackcdx/Capture.java +++ b/src/outbackcdx/Capture.java @@ -16,13 +16,9 @@ import java.time.LocalDateTime; import java.time.ZoneOffset; import java.time.format.DateTimeFormatter; -import java.util.Date; -import java.util.HashMap; -import java.util.Map; +import java.util.*; import java.util.logging.Level; import java.util.logging.Logger; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import static java.nio.charset.StandardCharsets.US_ASCII; import static java.nio.charset.StandardCharsets.UTF_8; @@ -75,9 +71,6 @@ public class Capture { public String originalFile = "-"; Map extra; - protected static Pattern URLKEY_POSTDATA_REGEX = - Pattern.compile("[?&](__wb_post_data|__warc_post_data)=([^&]+).*$", Pattern.CASE_INSENSITIVE); - public Capture(Map.Entry entry) { this(entry.getKey(), entry.getValue()); } @@ -301,8 +294,10 @@ public byte[] encodeValue(int version) { } } + private static final Set INFERRABLE_EXTRA_FIELDS = new HashSet<>(Arrays.asList("method", "requestBody")); + private void ensureNoExtraFields() { - if (extra != null && !extra.isEmpty()) { + if (extra != null && !extra.isEmpty() && !INFERRABLE_EXTRA_FIELDS.containsAll(extra.keySet())) { throw new IllegalStateException("Can't encode capture with extra (CDXJ) fields in index version < 5"); } } @@ -406,27 +401,6 @@ public String toString() { return out.toString(); } - /** - * If post data is available in urlkey, appends it to original url - * @param urlkey urlkey as passed in cdx line - * @param surt outbackcdx canonized surt - * @return surt with post-data appended - */ - private static String appendWbPostData(String urlkey, String surt) { - Matcher 
matchKey = URLKEY_POSTDATA_REGEX.matcher(urlkey); - Matcher matchOriginal = URLKEY_POSTDATA_REGEX.matcher(surt); - - if (matchKey.find() && !matchOriginal.matches() && matchKey.groupCount() > 1) { - StringBuilder sb = new StringBuilder(surt); - sb.append( surt.indexOf('?') < 0 ? '?' : '&' ); - sb.append(matchKey.group(1)); - sb.append("="); - sb.append(matchKey.group(2)); - return sb.toString(); - } - return surt; - } - public static Capture fromCdxLine(String line, UrlCanonicalizer canonicalizer) { String[] fields = line.split(" "); if (fields.length > 2 && fields[2].startsWith("{")) { @@ -436,7 +410,8 @@ public static Capture fromCdxLine(String line, UrlCanonicalizer canonicalizer) { Capture capture = new Capture(); capture.timestamp = parseCdxTimestamp(fields[1]); capture.original = fields[2]; - capture.urlkey = appendWbPostData(fields[0], canonicalizer.surtCanonicalize(capture.original)); + capture.inferMethodAndRequestBodyFromOldUrlKey(fields[0], canonicalizer); + capture.urlkey = capture.generateUrlKey(canonicalizer); capture.mimetype = fields[3]; capture.status = fields[4].equals("-") ? 
0 : Integer.parseInt(fields[4]); capture.digest = fields[5]; @@ -488,10 +463,119 @@ private static Capture fromCdxjLine(String line, UrlCanonicalizer canonicalizer) if (capture.original == null) { throw new IllegalArgumentException("Missing 'url' field in CDXJ line: " + line); } - capture.urlkey = appendWbPostData(fixedFields[0], canonicalizer.surtCanonicalize(capture.original)); + capture.inferMethodAndRequestBodyFromOldUrlKey(fixedFields[0], canonicalizer); + capture.urlkey = capture.generateUrlKey(canonicalizer); return capture; } + private String getExtraString(String field) { + if (extra != null) { + Object value = extra.get(field); + if (value instanceof String) { + return (String) value; + } + } + return null; + } + + private String[] extractQueryParams(String url) { + int queryIndex = url.indexOf('?'); + if (queryIndex == -1) { + return new String[0]; + } + String query = url.substring(queryIndex + 1); + return query.split("&"); + } + + /** + * Returns the strings that are in a but not in b. a and b most both be sorted. + */ + private List diffParams(String[] a, String[] b) { + List result = new ArrayList<>(); + int i = 0; + int j = 0; + while (i < a.length && j < b.length) { + int cmp = a[i].compareTo(b[j]); + if (cmp < 0) { + result.add(a[i]); + i++; + } else if (cmp > 0) { + j++; + } else { + i++; + j++; + } + } + while (i < a.length) { + result.add(a[i]); + i++; + } + return result; + + } + + /** + * Attempts to infer the method and requestBody extra fields by comparing an old urlkey against the original url. + *

+ * When run with the --post-append option, webrecorder/cdxj-indexer will include the request method and an encoded + * version of the request body as query parameters in the url key. In CDXJ output mode it will usually also + * add extra "method" and "requestBody" fields for these values. However, in CDX11 output mode there are no extra + * fields. Older versions of cdxj-indexer in CDXJ mode also didn't populate the extra fields. + *

+ * We can identify which parameters came from the request body because they won't appear in the original url's query string. + *

+ * This method does nothing if the extra fields are already populated. + */ + private void inferMethodAndRequestBodyFromOldUrlKey(String oldUrlKey, UrlCanonicalizer canonicalizer) { + if (oldUrlKey == null) return; + if (!oldUrlKey.contains("__wb_method=")) return; + if (extra != null) { + if (extra.containsKey("method")) return; + if (extra.containsKey("requestBody")) return; + } + + // if the old urlkey contains __wb_method but we don't have method and requestBody, + // then we try to extract them from the urlkey by looking for query parameters that appear + // in the old urlkey but aren't present in the original url field. + String[] oldParams = extractQueryParams(oldUrlKey); + String newUrlKey = canonicalizer.surtCanonicalize(original); + String[] newParams = extractQueryParams(newUrlKey); + List extraParams = diffParams(oldParams, newParams); + StringBuilder builder = new StringBuilder(); + for (String param: extraParams) { + if (param.startsWith("__wb_method=")) { + put("method", param.substring("__wb_method=".length()).toUpperCase(Locale.ROOT)); + } else { + // probably a request body parameter + if (builder.length() > 0) { + builder.append("&"); + } + builder.append(param); + } + } + if (builder.length() > 0) { + put("requestBody", builder.toString()); + } + } + + private String generateUrlKey(UrlCanonicalizer canonicalizer) { + String method = getExtraString("method"); + String requestBody = getExtraString("requestBody"); + String url; + if ("POST".equalsIgnoreCase(method) || "PUT".equalsIgnoreCase(method)) { + StringBuilder builder = new StringBuilder(original); + builder.append(original.contains("?") ? "&" : "?"); + builder.append("__wb_method=").append(method); + if (requestBody != null && !requestBody.isEmpty()) { + builder.append("&").append(requestBody); + } + url = builder.toString(); + } else { + url = original; + } + return canonicalizer.surtCanonicalize(url); + } + /** * Convert a 14 digit CDX timestamp into a 64 bit integer (long). 
If the supplied string is too short, 0 will be * appended to pad it out. If the supplied string is to long, an exception will be thrown. diff --git a/test/outbackcdx/CaptureBenchmark.java b/test/outbackcdx/CaptureBenchmark.java new file mode 100644 index 0000000..f3874a4 --- /dev/null +++ b/test/outbackcdx/CaptureBenchmark.java @@ -0,0 +1,78 @@ +package outbackcdx; + +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.infra.Blackhole; + +import java.io.IOException; + +public class CaptureBenchmark { + @State(Scope.Benchmark) + public static class MyState { + UrlCanonicalizer canonicalizer = new UrlCanonicalizer(); + Capture capture = Capture.fromCdxLine("- 19870102030405 http://example.org/ text/html 200 sha1:M5ORM4XQ5QCEZEDRNZRGSWXPCOGUVASI - 100 test.warc.gz", canonicalizer); + Capture capture2 = Capture.fromCdxLine("- 20210203115119 {\"url\": \"https://example.org/robots.txt\", " + + "\"mime\": \"unk\", \"status\": \"400\", \"digest\": \"3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ\", " + + "\"length\": \"451\", \"offset\": \"90493\", \"filename\": \"example.warc.gz\", \"method\": \"POST\", \"requestBody\": \"x=1&y=2\"}", + canonicalizer); + byte[] keyV3 = capture2.encodeKey(3); + byte[] keyV5 = capture2.encodeKey(5); + byte[] valueV3 = capture2.encodeValue(3); + byte[] valueV5 = capture2.encodeValue(5); + } + + @Benchmark + public Capture parseCdx(MyState state) { + return Capture.fromCdxLine("org,example)/ 19870102030405 http://example.org/ text/html 200 sha1:M5ORM4XQ5QCEZEDRNZRGSWXPCOGUVASI - 100 test.warc.gz", state.canonicalizer); + } + + @Benchmark + public Capture parseCdxInfer(MyState state) { + return Capture.fromCdxLine("org,example)/?__wb_method=post&__wb_post_data=dGVzdAo= 19870102030405 http://example.org/ text/html 200 sha1:M5ORM4XQ5QCEZEDRNZRGSWXPCOGUVASI - 100 test.warc.gz", state.canonicalizer); + } + + @Benchmark + public Capture parseCdxj(MyState 
state) { + return Capture.fromCdxLine("org,example)/ 20210203115119 {\"url\": \"https://example.org/\", " + + "\"mime\": \"unk\", \"status\": \"400\", \"digest\": \"3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ\", " + + "\"length\": \"451\", \"offset\": \"90493\", \"filename\": \"example.warc.gz\", " + + "\"non-standard-field\": [\"yes\", 2, 3], \"method\": \"POST\", \"requestBody\": \"x=1&y=2\"}", + state.canonicalizer); + } + + @Benchmark + public Capture parseCdxjInfer(MyState state) { + return Capture.fromCdxLine("org,example)/?__wb_method=post&__wb_post_data=dGVzdAo= 20210203115119 {\"url\": \"https://example.org/\", " + + "\"mime\": \"unk\", \"status\": \"400\", \"digest\": \"3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ\", " + + "\"length\": \"451\", \"offset\": \"90493\", \"filename\": \"example.warc.gz\", " + + "\"non-standard-field\": [\"yes\", 2, 3]}", + state.canonicalizer); + } + + @Benchmark + public void encodeV3(MyState state, Blackhole blackhole) { + blackhole.consume(state.capture.encodeKey(3)); + blackhole.consume(state.capture.encodeValue(3)); + } + + @Benchmark + public void encodeV5(MyState state, Blackhole blackhole) { + blackhole.consume(state.capture.encodeKey(5)); + blackhole.consume(state.capture.encodeValue(5)); + } + + @Benchmark + public void decodeV3(MyState state, Blackhole blackhole) { + blackhole.consume(new Capture(state.keyV3, state.valueV3)); + } + + @Benchmark + public void decodeV5(MyState state, Blackhole blackhole) { + blackhole.consume(new Capture(state.keyV5, state.valueV5)); + } + + public static void main(String[] args) throws IOException { + org.openjdk.jmh.Main.main(args); + } +} diff --git a/test/outbackcdx/CaptureTest.java b/test/outbackcdx/CaptureTest.java index 74430b5..926b715 100644 --- a/test/outbackcdx/CaptureTest.java +++ b/test/outbackcdx/CaptureTest.java @@ -3,6 +3,7 @@ import org.junit.Test; import java.util.Arrays; + import static org.junit.Assert.assertEquals; public class CaptureTest { @@ -51,12 +52,16 @@ public void 
testCdxDigestScheme() { @Test public void testCdxj() { - Capture src = Capture.fromCdxLine("- 20210203115119 {\"url\": \"https://example.org/robots.txt\", \"mime\": \"unk\", \"status\": \"400\", \"digest\": \"3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ\", \"length\": \"451\", \"offset\": \"90493\", \"filename\": \"example.warc.gz\", \"non-standard-field\": [\"yes\", 2, 3]}", new UrlCanonicalizer()); + Capture src = Capture.fromCdxLine("- 20210203115119 {\"url\": \"https://example.org/robots.txt\", " + + "\"mime\": \"unk\", \"status\": \"400\", \"digest\": \"3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ\", " + + "\"length\": \"451\", \"offset\": \"90493\", \"filename\": \"example.warc.gz\", " + + "\"non-standard-field\": [\"yes\", 2, 3], \"method\": \"POST\", \"requestBody\": \"x=1&y=2\"}", + new UrlCanonicalizer()); Capture dst = new Capture(src.encodeKey(5), src.encodeValue(5)); assertEquals(451, src.length); assertEquals(90493, src.compressedoffset); assertFieldsEqual(src, dst); - assertEquals("org,example)/robots.txt 20210203115119 https://example.org/robots.txt unk 400 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 451 90493 example.warc.gz - - -", dst.toString()); + assertEquals("org,example)/robots.txt?__wb_method=post&x=1&y=2 20210203115119 https://example.org/robots.txt unk 400 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 451 90493 example.warc.gz - - -", dst.toString()); assertEquals(Arrays.asList("yes", 2, 3), dst.get("non-standard-field")); } @@ -74,12 +79,14 @@ public void testWbPostDataExtraction() { UrlCanonicalizer canonicalizer = new UrlCanonicalizer(); // simple url with no parameters - Capture cap = Capture.fromCdxLine("com,test)/append?__wb_post_data=dGVzdAo= 20200528143535 https://test.com/append application/json 202 2WC5VZGPEJIVA6BQPKMISFH7ISBVWYUQ - - 467 4846509 test.warc.gz", canonicalizer); - assertEquals("com,test)/append?__wb_post_data=dGVzdAo=", cap.urlkey); + Capture cap = Capture.fromCdxLine("com,test)/append?__wb_method=post&__wb_post_data=dGVzdAo= 20200528143535 
https://test.com/append application/json 202 2WC5VZGPEJIVA6BQPKMISFH7ISBVWYUQ - - 467 4846509 test.warc.gz", canonicalizer); + assertEquals("com,test)/append?__wb_method=post&__wb_post_data=dgvzdao=", cap.urlkey); + assertEquals("POST", cap.get("method")); + assertEquals("__wb_post_data=dGVzdAo=", cap.get("requestBody")); // url with parameters - cap = Capture.fromCdxLine("com,test)/append?x=1&__wb_post_data=dGVzdAo= 20200528143535 https://test.com/append?x=1 application/json 202 2WC5VZGPEJIVA6BQPKMISFH7ISBVWYUQ - - 467 4846509 test.warc.gz", canonicalizer); - assertEquals("com,test)/append?x=1&__wb_post_data=dGVzdAo=", cap.urlkey); + cap = Capture.fromCdxLine("com,test)/append?x=1&__wb_method=post&__wb_post_data=dGVzdAo= 20200528143535 https://test.com/append?x=1 application/json 202 2WC5VZGPEJIVA6BQPKMISFH7ISBVWYUQ - - 467 4846509 test.warc.gz", canonicalizer); + assertEquals("com,test)/append?__wb_method=post&__wb_post_data=dgvzdao=&x=1", cap.urlkey); } @Test diff --git a/test/outbackcdx/IndexTest.java b/test/outbackcdx/IndexTest.java index 07a65cb..1f94e35 100644 --- a/test/outbackcdx/IndexTest.java +++ b/test/outbackcdx/IndexTest.java @@ -62,14 +62,14 @@ public void testClosest() throws IOException { @Test public void testPostData() throws IOException { try (Index.Batch batch = index.beginUpdate()) { - batch.putCapture(Capture.fromCdxLine("org,post)/?__wb_post_data=dGVzdAo= 20200528143307 http://post.org/ text/html 200 - - 0 w1", index.canonicalizer)); - batch.putCapture(Capture.fromCdxLine("org,post)/?__wb_post_data=dGVzdDIK 20200528143307 http://post.org/ text/html 200 - - 0 w1", index.canonicalizer)); + batch.putCapture(Capture.fromCdxLine("org,post)/?__wb_method=post&__wb_post_data=dGVzdAo= 20200528143307 http://post.org/ text/html 200 - - 0 w1", index.canonicalizer)); + batch.putCapture(Capture.fromCdxLine("org,post)/?__wb_method=post&__wb_post_data=dGVzdDIK 20200528143307 http://post.org/ text/html 200 - - 0 w1", index.canonicalizer)); batch.commit(); } 
List results = new ArrayList<>(); - index.closestQuery("org,post)/?__wb_post_data=dGVzdAo=", 20200528143307L, null).forEach(results::add); - assertEquals("org,post)/?__wb_post_data=dGVzdAo=", results.get(0).urlkey); + index.closestQuery("org,post)/?__wb_method=post&__wb_post_data=dgvzdao=", 20200528143307L, null).forEach(results::add); + assertEquals("org,post)/?__wb_method=post&__wb_post_data=dgvzdao=", results.get(0).urlkey); } @Test