Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Add method field to CDXJ #107

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions src/outbackcdx/Capture.java
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,9 @@ public class Capture {
public long originalCompressedoffset = -1;
public String originalFile = "-";

// pywb adds this to the CDXJ for POST requests
public String method = "GET";

protected static Pattern URLKEY_POSTDATA_REGEX =
Pattern.compile("[?&](__wb_post_data|__warc_post_data)=([^&]+).*$", Pattern.CASE_INSENSITIVE);

Expand Down Expand Up @@ -544,6 +547,9 @@ private void put(String field, Object value) {
case "originalFilename":
originalFile = coerceString(value);
break;
case "method":
method = coerceString(value);
break;
default:
throw new IllegalArgumentException("no such capture field: " + field);
}
Expand Down Expand Up @@ -597,6 +603,8 @@ public Object get(String field) {
} else {
return "bytes=" + compressedoffset + "-" + (compressedoffset + length - 1);
}
case "method":
return method;
default:
throw new IllegalArgumentException("no such capture field: " + field);
}
Expand Down
3 changes: 2 additions & 1 deletion test/outbackcdx/CaptureTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,11 @@ public void testCdxDigestScheme() {

@Test
public void testCdxj() {
Capture src = Capture.fromCdxLine("com,example)/robots.txt 20210203115119 {\"url\": \"https://example.org/robots.txt\", \"mime\": \"unk\", \"status\": \"400\", \"digest\": \"3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ\", \"length\": \"451\", \"offset\": \"90493\", \"filename\": \"example.warc.gz\"}", new UrlCanonicalizer());
Capture src = Capture.fromCdxLine("com,example)/robots.txt 20210203115119 {\"url\": \"https://example.org/robots.txt\", \"mime\": \"unk\", \"status\": \"400\", \"digest\": \"3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ\", \"length\": \"451\", \"offset\": \"90493\", \"filename\": \"example.warc.gz\", \"method\": \"POST\"}", new UrlCanonicalizer());
Capture dst = new Capture(src.encodeKey(), src.encodeValue());
assertEquals(451, src.length);
assertEquals(90493, src.compressedoffset);
assertEquals("POST", src.method);
assertFieldsEqual(src, dst);
assertEquals("com,example)/robots.txt 20210203115119 https://example.org/robots.txt unk 400 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 451 90493 example.warc.gz - - -", dst.toString());
}
Expand Down