Skip to content

Commit

Permalink
Support CDXJ method and requestBody fields encoded in urlkey
Browse files Browse the repository at this point in the history
This should improve compatibility with Pywb for POST and PUT requests.

#106
  • Loading branch information
ato committed Jun 9, 2023
1 parent d2370cf commit fb0e289
Show file tree
Hide file tree
Showing 5 changed files with 223 additions and 42 deletions.
12 changes: 12 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -166,5 +166,17 @@
<version>2.6</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.openjdk.jmh</groupId>
<artifactId>jmh-core</artifactId>
<version>1.36</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.openjdk.jmh</groupId>
<artifactId>jmh-generator-annprocess</artifactId>
<version>1.36</version>
<scope>test</scope>
</dependency>
</dependencies>
</project>
148 changes: 116 additions & 32 deletions src/outbackcdx/Capture.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,9 @@
import java.time.LocalDateTime;
import java.time.ZoneOffset;
import java.time.format.DateTimeFormatter;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.*;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static java.nio.charset.StandardCharsets.US_ASCII;
import static java.nio.charset.StandardCharsets.UTF_8;
Expand Down Expand Up @@ -75,9 +71,6 @@ public class Capture {
public String originalFile = "-";
Map<String,Object> extra;

protected static Pattern URLKEY_POSTDATA_REGEX =
Pattern.compile("[?&](__wb_post_data|__warc_post_data)=([^&]+).*$", Pattern.CASE_INSENSITIVE);

public Capture(Map.Entry<byte[], byte[]> entry) {
this(entry.getKey(), entry.getValue());
}
Expand Down Expand Up @@ -301,8 +294,10 @@ public byte[] encodeValue(int version) {
}
}

private static final Set<String> INFERRABLE_EXTRA_FIELDS = new HashSet<>(Arrays.asList("method", "requestBody"));

private void ensureNoExtraFields() {
if (extra != null && !extra.isEmpty()) {
if (extra != null && !extra.isEmpty() && !INFERRABLE_EXTRA_FIELDS.containsAll(extra.keySet())) {
throw new IllegalStateException("Can't encode capture with extra (CDXJ) fields in index version < 5");
}
}
Expand Down Expand Up @@ -406,27 +401,6 @@ public String toString() {
return out.toString();
}

/**
 * If post data is available in urlkey, appends it to the canonicalized surt.
 * <p>
 * The urlkey is searched for a {@code __wb_post_data} or {@code __warc_post_data} query parameter
 * (see {@code URLKEY_POSTDATA_REGEX}). When one is found, and the surt does not already carry such a
 * parameter, the parameter name and value are appended to the surt using {@code ?} or {@code &}
 * depending on whether the surt already has a query string.
 *
 * @param urlkey urlkey as passed in cdx line
 * @param surt outbackcdx canonized surt
 * @return surt with post-data appended, or the surt unchanged when there is nothing to append
 */
private static String appendWbPostData(String urlkey, String surt) {
    Matcher keyMatcher = URLKEY_POSTDATA_REGEX.matcher(urlkey);
    if (!keyMatcher.find()) {
        return surt;
    }
    // find(), not matches(): the pattern begins with [?&], so it can never match an entire surt and
    // a matches() check would never detect post data that is already present.
    if (URLKEY_POSTDATA_REGEX.matcher(surt).find()) {
        return surt;
    }
    StringBuilder sb = new StringBuilder(surt);
    sb.append(surt.indexOf('?') < 0 ? '?' : '&');
    sb.append(keyMatcher.group(1));
    sb.append("=");
    sb.append(keyMatcher.group(2));
    return sb.toString();
}

public static Capture fromCdxLine(String line, UrlCanonicalizer canonicalizer) {
String[] fields = line.split(" ");
if (fields.length > 2 && fields[2].startsWith("{")) {
Expand All @@ -436,7 +410,8 @@ public static Capture fromCdxLine(String line, UrlCanonicalizer canonicalizer) {
Capture capture = new Capture();
capture.timestamp = parseCdxTimestamp(fields[1]);
capture.original = fields[2];
capture.urlkey = appendWbPostData(fields[0], canonicalizer.surtCanonicalize(capture.original));
capture.inferMethodAndRequestBodyFromOldUrlKey(fields[0], canonicalizer);
capture.urlkey = capture.generateUrlKey(canonicalizer);
capture.mimetype = fields[3];
capture.status = fields[4].equals("-") ? 0 : Integer.parseInt(fields[4]);
capture.digest = fields[5];
Expand Down Expand Up @@ -488,10 +463,119 @@ private static Capture fromCdxjLine(String line, UrlCanonicalizer canonicalizer)
if (capture.original == null) {
throw new IllegalArgumentException("Missing 'url' field in CDXJ line: " + line);
}
capture.urlkey = appendWbPostData(fixedFields[0], canonicalizer.surtCanonicalize(capture.original));
capture.inferMethodAndRequestBodyFromOldUrlKey(fixedFields[0], canonicalizer);
capture.urlkey = capture.generateUrlKey(canonicalizer);
return capture;
}

/**
 * Reads an extra (CDXJ) field, returning it only when it holds a string.
 *
 * @param field name of the extra field to look up
 * @return the field's value when the extra map exists and the value is a String, otherwise null
 */
private String getExtraString(String field) {
    if (extra == null) {
        return null;
    }
    Object candidate = extra.get(field);
    return candidate instanceof String ? (String) candidate : null;
}

/**
 * Splits the query portion of a url (the text after the first '?') on '&amp;'.
 *
 * @param url url or urlkey to inspect
 * @return the query parameters in the order they appear, or an empty array when there is no '?'
 */
private String[] extractQueryParams(String url) {
    int questionMark = url.indexOf('?');
    return questionMark < 0
            ? new String[0]
            : url.substring(questionMark + 1).split("&");
}

/**
 * Returns the strings that are in a but not in b, in the order they appear in a.
 * <p>
 * The inputs are treated as multisets: each occurrence in b cancels at most one equal occurrence
 * in a. Neither array needs to be sorted, so query parameters can be passed in whatever order an
 * externally generated urlkey happens to list them.
 *
 * @param a candidate strings
 * @param b strings to subtract from a
 * @return the elements of a left over after removing those matched by b
 */
private List<String> diffParams(String[] a, String[] b) {
    // Count how many times each string occurs in b so duplicates cancel one-for-one.
    Map<String, Integer> unmatched = new HashMap<>();
    for (String param : b) {
        unmatched.merge(param, 1, Integer::sum);
    }

    List<String> result = new ArrayList<>();
    for (String param : a) {
        Integer count = unmatched.get(param);
        if (count == null) {
            result.add(param);
        } else if (count == 1) {
            unmatched.remove(param);
        } else {
            unmatched.put(param, count - 1);
        }
    }
    return result;
}

/**
 * Tries to populate the "method" and "requestBody" extra fields by comparing a previously generated
 * urlkey with the original url.
 * <p>
 * When run with the --post-append option, webrecorder/cdxj-indexer includes the request method and an
 * encoded form of the request body as query parameters in the url key. In CDXJ output mode it usually
 * also writes "method" and "requestBody" extra fields, but in CDX11 output mode there are no extra
 * fields, and older versions of cdxj-indexer did not populate them in CDXJ mode either.
 * <p>
 * Query parameters that appear in the old urlkey but not in the canonicalized original url are taken
 * to have come from the request: a {@code __wb_method=} parameter supplies the method (upper-cased)
 * and all remaining ones are joined with '&amp;' to form the request body.
 * <p>
 * Nothing happens when the old urlkey is null, when it has no {@code __wb_method=} parameter, or when
 * either extra field is already present.
 *
 * @param oldUrlKey urlkey as supplied in the incoming CDX or CDXJ line
 * @param canonicalizer canonicalizer used to derive the query parameters of the original url
 */
private void inferMethodAndRequestBodyFromOldUrlKey(String oldUrlKey, UrlCanonicalizer canonicalizer) {
    final String methodPrefix = "__wb_method=";
    if (oldUrlKey == null || !oldUrlKey.contains(methodPrefix)) {
        return;
    }
    if (extra != null && (extra.containsKey("method") || extra.containsKey("requestBody"))) {
        return;
    }

    // Parameters present in the supplied urlkey but absent from the original url's own query string
    // are the ones the indexer appended.
    String[] paramsInOldKey = extractQueryParams(oldUrlKey);
    String[] paramsInOriginal = extractQueryParams(canonicalizer.surtCanonicalize(original));

    StringBuilder requestBody = new StringBuilder();
    for (String param : diffParams(paramsInOldKey, paramsInOriginal)) {
        if (param.startsWith(methodPrefix)) {
            put("method", param.substring(methodPrefix.length()).toUpperCase(Locale.ROOT));
            continue;
        }
        // anything else is assumed to be a request body parameter
        if (requestBody.length() > 0) {
            requestBody.append("&");
        }
        requestBody.append(param);
    }
    if (requestBody.length() > 0) {
        put("requestBody", requestBody.toString());
    }
}

/**
 * Builds the urlkey for this capture from the original url and, for POST and PUT captures, the
 * "method" and "requestBody" extra fields.
 * <p>
 * For POST or PUT (compared case-insensitively) the url is extended with a {@code __wb_method}
 * query parameter followed by the request body, when one is present and non-empty, before being
 * canonicalized. For any other method the original url is canonicalized as is.
 *
 * @param canonicalizer canonicalizer used to produce the SURT form
 * @return the canonicalized urlkey
 */
private String generateUrlKey(UrlCanonicalizer canonicalizer) {
    String method = getExtraString("method");
    String requestBody = getExtraString("requestBody");

    boolean carriesBody = "POST".equalsIgnoreCase(method) || "PUT".equalsIgnoreCase(method);
    if (!carriesBody) {
        return canonicalizer.surtCanonicalize(original);
    }

    char separator = original.contains("?") ? '&' : '?';
    String url = original + separator + "__wb_method=" + method;
    if (requestBody != null && !requestBody.isEmpty()) {
        url = url + "&" + requestBody;
    }
    return canonicalizer.surtCanonicalize(url);
}

/**
* Convert a 14 digit CDX timestamp into a 64 bit integer (long). If the supplied string is too short, 0 will be
* appended to pad it out. If the supplied string is too long, an exception will be thrown.
Expand Down
78 changes: 78 additions & 0 deletions test/outbackcdx/CaptureBenchmark.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
package outbackcdx;

import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.infra.Blackhole;

import java.io.IOException;

/**
 * JMH benchmarks for parsing CDX / CDXJ lines into {@link Capture} objects and for encoding and
 * decoding captures in index versions 3 and 5.
 */
public class CaptureBenchmark {
    /** CDX line whose urlkey column is "-". */
    private static final String CDX_WITHOUT_KEY =
            "- 19870102030405 http://example.org/ text/html 200 sha1:M5ORM4XQ5QCEZEDRNZRGSWXPCOGUVASI - 100 test.warc.gz";

    /** CDX line with an ordinary urlkey. */
    private static final String CDX_WITH_KEY =
            "org,example)/ 19870102030405 http://example.org/ text/html 200 sha1:M5ORM4XQ5QCEZEDRNZRGSWXPCOGUVASI - 100 test.warc.gz";

    /** CDX line whose urlkey carries __wb_method and post data parameters. */
    private static final String CDX_WITH_POST_KEY =
            "org,example)/?__wb_method=post&__wb_post_data=dGVzdAo= 19870102030405 http://example.org/ text/html 200 sha1:M5ORM4XQ5QCEZEDRNZRGSWXPCOGUVASI - 100 test.warc.gz";

    /** CDXJ line with "-" as urlkey and explicit method / requestBody fields. */
    private static final String CDXJ_WITHOUT_KEY = "- 20210203115119 {\"url\": \"https://example.org/robots.txt\", "
            + "\"mime\": \"unk\", \"status\": \"400\", \"digest\": \"3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ\", "
            + "\"length\": \"451\", \"offset\": \"90493\", \"filename\": \"example.warc.gz\", \"method\": \"POST\", \"requestBody\": \"x=1&y=2\"}";

    /** CDXJ line with a plain urlkey, a non-standard field and explicit method / requestBody fields. */
    private static final String CDXJ_WITH_FIELDS = "org,example)/ 20210203115119 {\"url\": \"https://example.org/\", "
            + "\"mime\": \"unk\", \"status\": \"400\", \"digest\": \"3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ\", "
            + "\"length\": \"451\", \"offset\": \"90493\", \"filename\": \"example.warc.gz\", "
            + "\"non-standard-field\": [\"yes\", 2, 3], \"method\": \"POST\", \"requestBody\": \"x=1&y=2\"}";

    /** CDXJ line whose urlkey carries the method and post data but whose JSON has no such fields. */
    private static final String CDXJ_WITH_POST_KEY = "org,example)/?__wb_method=post&__wb_post_data=dGVzdAo= 20210203115119 {\"url\": \"https://example.org/\", "
            + "\"mime\": \"unk\", \"status\": \"400\", \"digest\": \"3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ\", "
            + "\"length\": \"451\", \"offset\": \"90493\", \"filename\": \"example.warc.gz\", "
            + "\"non-standard-field\": [\"yes\", 2, 3]}";

    /**
     * Benchmark-scoped state: a canonicalizer, two pre-parsed captures, and the version 3 and
     * version 5 key/value encodings of the second capture used by the decode benchmarks.
     */
    @State(Scope.Benchmark)
    public static class MyState {
        UrlCanonicalizer canonicalizer = new UrlCanonicalizer();
        Capture capture = Capture.fromCdxLine(CDX_WITHOUT_KEY, canonicalizer);
        Capture capture2 = Capture.fromCdxLine(CDXJ_WITHOUT_KEY, canonicalizer);
        byte[] keyV3 = capture2.encodeKey(3);
        byte[] keyV5 = capture2.encodeKey(5);
        byte[] valueV3 = capture2.encodeValue(3);
        byte[] valueV5 = capture2.encodeValue(5);
    }

    /** Parses a CDX line with an ordinary urlkey. */
    @Benchmark
    public Capture parseCdx(MyState state) {
        return Capture.fromCdxLine(CDX_WITH_KEY, state.canonicalizer);
    }

    /** Parses a CDX line whose urlkey contains the method and post data parameters. */
    @Benchmark
    public Capture parseCdxInfer(MyState state) {
        return Capture.fromCdxLine(CDX_WITH_POST_KEY, state.canonicalizer);
    }

    /** Parses a CDXJ line that supplies method and requestBody as JSON fields. */
    @Benchmark
    public Capture parseCdxj(MyState state) {
        return Capture.fromCdxLine(CDXJ_WITH_FIELDS, state.canonicalizer);
    }

    /** Parses a CDXJ line whose urlkey, rather than its JSON, contains the method and post data. */
    @Benchmark
    public Capture parseCdxjInfer(MyState state) {
        return Capture.fromCdxLine(CDXJ_WITH_POST_KEY, state.canonicalizer);
    }

    /** Encodes the key and value of the first state capture in index version 3. */
    @Benchmark
    public void encodeV3(MyState state, Blackhole blackhole) {
        Capture subject = state.capture;
        blackhole.consume(subject.encodeKey(3));
        blackhole.consume(subject.encodeValue(3));
    }

    /** Encodes the key and value of the first state capture in index version 5. */
    @Benchmark
    public void encodeV5(MyState state, Blackhole blackhole) {
        Capture subject = state.capture;
        blackhole.consume(subject.encodeKey(5));
        blackhole.consume(subject.encodeValue(5));
    }

    /** Decodes a capture from the pre-encoded version 3 key and value. */
    @Benchmark
    public void decodeV3(MyState state, Blackhole blackhole) {
        Capture decoded = new Capture(state.keyV3, state.valueV3);
        blackhole.consume(decoded);
    }

    /** Decodes a capture from the pre-encoded version 5 key and value. */
    @Benchmark
    public void decodeV5(MyState state, Blackhole blackhole) {
        Capture decoded = new Capture(state.keyV5, state.valueV5);
        blackhole.consume(decoded);
    }

    /** Hands the command line arguments to the JMH launcher. */
    public static void main(String[] args) throws IOException {
        org.openjdk.jmh.Main.main(args);
    }
}
19 changes: 13 additions & 6 deletions test/outbackcdx/CaptureTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import org.junit.Test;

import java.util.Arrays;

import static org.junit.Assert.assertEquals;

public class CaptureTest {
Expand Down Expand Up @@ -51,12 +52,16 @@ public void testCdxDigestScheme() {

@Test
public void testCdxj() {
Capture src = Capture.fromCdxLine("- 20210203115119 {\"url\": \"https://example.org/robots.txt\", \"mime\": \"unk\", \"status\": \"400\", \"digest\": \"3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ\", \"length\": \"451\", \"offset\": \"90493\", \"filename\": \"example.warc.gz\", \"non-standard-field\": [\"yes\", 2, 3]}", new UrlCanonicalizer());
Capture src = Capture.fromCdxLine("- 20210203115119 {\"url\": \"https://example.org/robots.txt\", " +
"\"mime\": \"unk\", \"status\": \"400\", \"digest\": \"3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ\", " +
"\"length\": \"451\", \"offset\": \"90493\", \"filename\": \"example.warc.gz\", " +
"\"non-standard-field\": [\"yes\", 2, 3], \"method\": \"POST\", \"requestBody\": \"x=1&y=2\"}",
new UrlCanonicalizer());
Capture dst = new Capture(src.encodeKey(5), src.encodeValue(5));
assertEquals(451, src.length);
assertEquals(90493, src.compressedoffset);
assertFieldsEqual(src, dst);
assertEquals("org,example)/robots.txt 20210203115119 https://example.org/robots.txt unk 400 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 451 90493 example.warc.gz - - -", dst.toString());
assertEquals("org,example)/robots.txt?__wb_method=post&x=1&y=2 20210203115119 https://example.org/robots.txt unk 400 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 451 90493 example.warc.gz - - -", dst.toString());
assertEquals(Arrays.asList("yes", 2, 3), dst.get("non-standard-field"));
}

Expand All @@ -74,12 +79,14 @@ public void testWbPostDataExtraction() {
UrlCanonicalizer canonicalizer = new UrlCanonicalizer();

// simple url with no parameters
Capture cap = Capture.fromCdxLine("com,test)/append?__wb_post_data=dGVzdAo= 20200528143535 https://test.com/append application/json 202 2WC5VZGPEJIVA6BQPKMISFH7ISBVWYUQ - - 467 4846509 test.warc.gz", canonicalizer);
assertEquals("com,test)/append?__wb_post_data=dGVzdAo=", cap.urlkey);
Capture cap = Capture.fromCdxLine("com,test)/append?__wb_method=post&__wb_post_data=dGVzdAo= 20200528143535 https://test.com/append application/json 202 2WC5VZGPEJIVA6BQPKMISFH7ISBVWYUQ - - 467 4846509 test.warc.gz", canonicalizer);
assertEquals("com,test)/append?__wb_method=post&__wb_post_data=dgvzdao=", cap.urlkey);
assertEquals("POST", cap.get("method"));
assertEquals("__wb_post_data=dGVzdAo=", cap.get("requestBody"));

// url with parameters
cap = Capture.fromCdxLine("com,test)/append?x=1&__wb_post_data=dGVzdAo= 20200528143535 https://test.com/append?x=1 application/json 202 2WC5VZGPEJIVA6BQPKMISFH7ISBVWYUQ - - 467 4846509 test.warc.gz", canonicalizer);
assertEquals("com,test)/append?x=1&__wb_post_data=dGVzdAo=", cap.urlkey);
cap = Capture.fromCdxLine("com,test)/append?x=1&__wb_method=post&__wb_post_data=dGVzdAo= 20200528143535 https://test.com/append?x=1 application/json 202 2WC5VZGPEJIVA6BQPKMISFH7ISBVWYUQ - - 467 4846509 test.warc.gz", canonicalizer);
assertEquals("com,test)/append?__wb_method=post&__wb_post_data=dgvzdao=&x=1", cap.urlkey);
}

@Test
Expand Down
8 changes: 4 additions & 4 deletions test/outbackcdx/IndexTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -62,14 +62,14 @@ public void testClosest() throws IOException {
@Test
public void testPostData() throws IOException {
try (Index.Batch batch = index.beginUpdate()) {
batch.putCapture(Capture.fromCdxLine("org,post)/?__wb_post_data=dGVzdAo= 20200528143307 http://post.org/ text/html 200 - - 0 w1", index.canonicalizer));
batch.putCapture(Capture.fromCdxLine("org,post)/?__wb_post_data=dGVzdDIK 20200528143307 http://post.org/ text/html 200 - - 0 w1", index.canonicalizer));
batch.putCapture(Capture.fromCdxLine("org,post)/?__wb_method=post&__wb_post_data=dGVzdAo= 20200528143307 http://post.org/ text/html 200 - - 0 w1", index.canonicalizer));
batch.putCapture(Capture.fromCdxLine("org,post)/?__wb_method=post&__wb_post_data=dGVzdDIK 20200528143307 http://post.org/ text/html 200 - - 0 w1", index.canonicalizer));
batch.commit();
}

List<Capture> results = new ArrayList<>();
index.closestQuery("org,post)/?__wb_post_data=dGVzdAo=", 20200528143307L, null).forEach(results::add);
assertEquals("org,post)/?__wb_post_data=dGVzdAo=", results.get(0).urlkey);
index.closestQuery("org,post)/?__wb_method=post&__wb_post_data=dgvzdao=", 20200528143307L, null).forEach(results::add);
assertEquals("org,post)/?__wb_method=post&__wb_post_data=dgvzdao=", results.get(0).urlkey);
}

@Test
Expand Down

0 comments on commit fb0e289

Please # to comment.