Skip to content

Commit e3a9cab

Browse files
Authored: Mar 14, 2025
HTTP/1: Support streaming requests of unknown length (#506)
**Issue:** aws-sdk-cpp needs to send streaming requests of unknown content length. This isn't currently supported in our HTTP/1 client. (our HTTP/2 client *does* already support this) **Background:** The HTTP/1 client already supported these use cases: - Send streaming request body, but declare Content-Length up front - Send chunked request body, but user must declare length of each chunk via `write_chunk()` API. But it wasn't possible to send a stream of unknown length without doing extra copies (read to intermediate buffer, then send that buffer via write_chunk() API). **Description of changes:** Support input streams of unknown length via chunked encoding. When reading from the stream, leave a bit of space at the start and end of the aws_io_message buffer for the chunk prefix and suffix. Then go back and write in the prefix and suffix afterwards. No unnecessary copies are involved.
1 parent 60c43f8 commit e3a9cab

File tree

6 files changed

+548
-70
lines changed

6 files changed

+548
-70
lines changed
 

‎include/aws/http/private/h1_encoder.h

+7-4
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,12 @@ struct aws_h1_encoder_message {
5353
enum aws_h1_encoder_state {
5454
AWS_H1_ENCODER_STATE_INIT,
5555
AWS_H1_ENCODER_STATE_HEAD,
56-
AWS_H1_ENCODER_STATE_UNCHUNKED_BODY,
56+
/* Write streaming body, without chunked encoding, because Content-Length is known */
57+
AWS_H1_ENCODER_STATE_UNCHUNKED_BODY_STREAM,
58+
/* Write streaming body, with chunked encoding, because Content-Length is unknown */
59+
AWS_H1_ENCODER_STATE_CHUNKED_BODY_STREAM,
60+
AWS_H1_ENCODER_STATE_CHUNKED_BODY_STREAM_LAST_CHUNK,
61+
/* The rest of the _CHUNK_ states support the write_chunk() API (body stream not provided up front) */
5762
AWS_H1_ENCODER_STATE_CHUNK_NEXT,
5863
AWS_H1_ENCODER_STATE_CHUNK_LINE,
5964
AWS_H1_ENCODER_STATE_CHUNK_BODY,
@@ -73,7 +78,7 @@ struct aws_h1_encoder {
7378
/* Current chunk */
7479
struct aws_h1_chunk *current_chunk;
7580
/* Number of chunks sent, just used for logging */
76-
size_t chunk_count;
81+
uint64_t chunk_count;
7782
/* Encoder logs with this stream ptr as the ID, and passes this ptr to the chunk_complete callback */
7883
struct aws_http_stream *current_stream;
7984
};
@@ -91,8 +96,6 @@ void aws_h1_chunk_destroy(struct aws_h1_chunk *chunk);
9196
/* Destroy chunk and fire its completion callback */
9297
void aws_h1_chunk_complete_and_destroy(struct aws_h1_chunk *chunk, struct aws_http_stream *http_stream, int error_code);
9398

94-
int aws_chunk_line_from_options(struct aws_http1_chunk_options *options, struct aws_byte_buf *chunk_line);
95-
9699
AWS_EXTERN_C_BEGIN
97100

98101
/* Validate request and cache any info the encoder will need later in the "encoder message". */

‎include/aws/http/request_response.h

+2-1
Original file line numberDiff line numberDiff line change
@@ -881,7 +881,8 @@ AWS_FUTURE_T_POINTER_WITH_RELEASE_DECLARATION(aws_future_http_message, struct aw
881881

882882
/**
883883
* Submit a chunk of data to be sent on an HTTP/1.1 stream.
884-
* The stream must have specified "chunked" in a "transfer-encoding" header.
884+
* The stream must have specified "chunked" in a "transfer-encoding" header,
885+
* and the aws_http_message must NOT have any body stream set.
885886
* For client streams, activate() must be called before any chunks are submitted.
886887
* For server streams, the response must be submitted before any chunks.
887888
* A final chunk with size 0 must be submitted to successfully complete the HTTP-stream.

‎source/h1_encoder.c

+139-14
Original file line numberDiff line numberDiff line change
@@ -127,14 +127,6 @@ static int s_scan_outgoing_headers(
127127
return aws_raise_error(AWS_ERROR_HTTP_INVALID_HEADER_FIELD);
128128
}
129129

130-
if (encoder_message->has_chunked_encoding_header && has_body_stream) {
131-
AWS_LOGF_ERROR(
132-
AWS_LS_HTTP_STREAM,
133-
"id=static: Both Transfer-Encoding chunked header and body stream is set. "
134-
"chunked data must use the chunk API to write the body stream.");
135-
return aws_raise_error(AWS_ERROR_HTTP_INVALID_BODY_STREAM);
136-
}
137-
138130
if (body_headers_forbidden && (encoder_message->content_length > 0 || has_transfer_encoding_header)) {
139131
AWS_LOGF_ERROR(
140132
AWS_LS_HTTP_STREAM,
@@ -663,7 +655,7 @@ static int s_encode_stream(
663655
err = aws_input_stream_get_status(stream, &status);
664656
if (err) {
665657
ENCODER_LOGF(
666-
TRACE,
658+
ERROR,
667659
encoder,
668660
"Failed to query body stream status, error %d (%s)",
669661
aws_last_error(),
@@ -729,7 +721,10 @@ static int s_state_fn_head(struct aws_h1_encoder *encoder, struct aws_byte_buf *
729721

730722
/* Pick next state */
731723
if (encoder->message->body && encoder->message->content_length) {
732-
return s_switch_state(encoder, AWS_H1_ENCODER_STATE_UNCHUNKED_BODY);
724+
return s_switch_state(encoder, AWS_H1_ENCODER_STATE_UNCHUNKED_BODY_STREAM);
725+
726+
} else if (encoder->message->body && encoder->message->has_chunked_encoding_header) {
727+
return s_switch_state(encoder, AWS_H1_ENCODER_STATE_CHUNKED_BODY_STREAM);
733728

734729
} else if (encoder->message->has_chunked_encoding_header) {
735730
return s_switch_state(encoder, AWS_H1_ENCODER_STATE_CHUNK_NEXT);
@@ -739,8 +734,8 @@ static int s_state_fn_head(struct aws_h1_encoder *encoder, struct aws_byte_buf *
739734
}
740735
}
741736

742-
/* Write out body (not using chunked encoding). */
743-
static int s_state_fn_unchunked_body(struct aws_h1_encoder *encoder, struct aws_byte_buf *dst) {
737+
/* Write out body with known Content-Length (not using chunked encoding). */
738+
static int s_state_fn_unchunked_body_stream(struct aws_h1_encoder *encoder, struct aws_byte_buf *dst) {
744739
bool done;
745740
if (s_encode_stream(encoder, dst, encoder->message->body, encoder->message->content_length, &done)) {
746741
return AWS_OP_ERR;
@@ -755,6 +750,133 @@ static int s_state_fn_unchunked_body(struct aws_h1_encoder *encoder, struct aws_
755750
return s_switch_state(encoder, AWS_H1_ENCODER_STATE_DONE);
756751
}
757752

753+
/* Write out body (of unknown Content-Length) using chunked encoding.
754+
* Each pass through this state writes out 1 chunk of body data (or nothing at all). */
755+
static int s_state_fn_chunked_body_stream(struct aws_h1_encoder *encoder, struct aws_byte_buf *dst) {
756+
757+
/* Each chunk is prefixed with: CHUNK-LENGTH-IN-ASCII-HEX CRLF
758+
* and suffixed with: CRLF
759+
*
760+
* When reading from the stream, we don't know how much data we'll get,
761+
* but the length needs to go in the prefix, before the data!
762+
* Therefore, leave space at start of dst buffer for the prefix,
763+
* we'll go back and write it AFTER streaming the body data.
764+
* Leave space at the end for the suffix too.
765+
*
766+
* Use a predictable length for the prefix.
767+
* 8 hex chars (i.e. "000000F7") seems reasonable (4 is too small, 16 is ridiculous, dynamic is complicated). */
768+
enum { padded_hex_len = 8 }; /* enum, because it's used as size for stack array */
769+
const char *padded_hex_fmt = "%08zX";
770+
const size_t max_hex_value_given_padding = UINT32_MAX; /* fits in 8 chars */
771+
const size_t chunk_prefix_len = padded_hex_len + CRLF_SIZE;
772+
const size_t chunk_suffix_len = CRLF_SIZE;
773+
774+
/* If dst buffer nearly full, don't bother reading from stream.
775+
* Remain in this state and we'll get a fresh buffer next tick. */
776+
const size_t dont_bother_if_space_less_than = 128; /* magic number, seems reasonable */
777+
AWS_ASSERT(dont_bother_if_space_less_than > chunk_prefix_len + chunk_suffix_len);
778+
if (dst->capacity - dst->len < dont_bother_if_space_less_than) {
779+
/* If this buffer is empty, and still not big enough, just give up.
780+
* Probably never happens, but g_aws_channel_max_fragment_size can theoretically be tweaked by user. */
781+
if (dst->len == 0) {
782+
AWS_LOGF_ERROR(
783+
AWS_LS_HTTP_STREAM, "id=%p Channel max fragment size is too small.", (void *)encoder->current_stream);
784+
return aws_raise_error(AWS_ERROR_INVALID_STATE);
785+
}
786+
787+
/* Remain in this state and we'll get a fresh buffer next tick */
788+
return AWS_OP_SUCCESS;
789+
}
790+
791+
/* Use a sub-buffer to limit where body can go.
792+
* Body will go after chunk-prefix, and needs to leave enough space for chunk-suffix afterwards. */
793+
uint8_t *body_sub_buf_start = dst->buffer + dst->len + chunk_prefix_len;
794+
uint8_t *body_sub_buf_end = dst->buffer + dst->capacity - chunk_suffix_len;
795+
struct aws_byte_buf body_sub_buf =
796+
aws_byte_buf_from_empty_array(body_sub_buf_start, body_sub_buf_end - body_sub_buf_start);
797+
/* We set aside a fixed number of bytes to encode the length, don't read more than that */
798+
body_sub_buf.capacity = aws_min_size(body_sub_buf.capacity, max_hex_value_given_padding);
799+
800+
/* Stream body into sub-buffer */
801+
ENCODER_LOG(TRACE, encoder, "Reading from body stream.");
802+
if (aws_input_stream_read(encoder->message->body, &body_sub_buf) != AWS_OP_SUCCESS) {
803+
ENCODER_LOGF(
804+
ERROR,
805+
encoder,
806+
"Failed to read body stream, error %d (%s)",
807+
aws_last_error(),
808+
aws_error_name(aws_last_error()));
809+
return AWS_OP_ERR;
810+
}
811+
812+
/* If ANY body data was streamed, then write in chunk prefix and suffix.
813+
*
814+
* (else no body data streamed, so dst remains untouched. Maybe we've
815+
* reached end of stream, maybe user just doesn't have data yet to send) */
816+
if (body_sub_buf.len > 0) {
817+
encoder->chunk_count++;
818+
ENCODER_LOGF(
819+
TRACE, encoder, "Sending chunk #%" PRIu64 " with size %zu", encoder->chunk_count, body_sub_buf.len);
820+
bool wrote_all = true;
821+
822+
/* Write chunk-prefix: LENGTH-IN-HEX CRLF */
823+
char hexbuf[padded_hex_len + 1] = {0};
824+
AWS_ASSERT(body_sub_buf.len <= max_hex_value_given_padding); /* guaranteed, b/c we clamped .capacity earlier */
825+
snprintf(hexbuf, sizeof(hexbuf), padded_hex_fmt, body_sub_buf.len);
826+
827+
wrote_all &= aws_byte_buf_write_from_whole_cursor(dst, aws_byte_cursor_from_c_str(hexbuf));
828+
wrote_all &= s_write_crlf(dst);
829+
830+
/* Increment dst->len, since we already copied body in there via sub-buffer */
831+
AWS_ASSERT(dst->buffer + dst->len == body_sub_buf_start); /* written chunk-prefix should end at body start */
832+
dst->len += body_sub_buf.len; /* safe b/c we clamped body_sub_buf.capacity earlier */
833+
834+
/* Write chunk-suffix: CRLF */
835+
wrote_all &= s_write_crlf(dst);
836+
837+
AWS_ASSERT(wrote_all); /* everything should have fit, we did a lot of math and clamping to guarantee it */
838+
(void)wrote_all;
839+
}
840+
841+
/* If body stream has ended: switch states.
842+
* As an optimization, we only do this check when the stream didn't 100% fill the buffer */
843+
if (body_sub_buf.len < body_sub_buf.capacity) {
844+
struct aws_stream_status stream_status;
845+
if (aws_input_stream_get_status(encoder->message->body, &stream_status) != AWS_OP_SUCCESS) {
846+
ENCODER_LOGF(
847+
ERROR,
848+
encoder,
849+
"Failed to query body stream status, error %d (%s)",
850+
aws_last_error(),
851+
aws_error_name(aws_last_error()));
852+
return AWS_OP_ERR;
853+
}
854+
855+
if (stream_status.is_end_of_stream) {
856+
encoder->chunk_count++;
857+
ENCODER_LOGF(TRACE, encoder, "Sending last chunk #%" PRIu64, encoder->chunk_count);
858+
return s_switch_state(encoder, AWS_H1_ENCODER_STATE_CHUNKED_BODY_STREAM_LAST_CHUNK);
859+
}
860+
}
861+
862+
/* Remain in state until done streaming body */
863+
return AWS_OP_SUCCESS;
864+
}
865+
866+
/* Note: this state is ONLY used when streaming a body of unknown Content-Length.
867+
* It is NOT used when the write_chunk() API is being used. */
868+
static int s_state_fn_chunked_body_stream_last_chunk(struct aws_h1_encoder *encoder, struct aws_byte_buf *dst) {
869+
870+
struct aws_byte_cursor last_chunk = AWS_BYTE_CUR_INIT_FROM_STRING_LITERAL("0\r\n");
871+
if (aws_byte_buf_write_from_whole_cursor(dst, last_chunk) == true) {
872+
ENCODER_LOG(TRACE, encoder, "Last chunk complete");
873+
return s_switch_state(encoder, AWS_H1_ENCODER_STATE_CHUNK_TRAILER);
874+
} else {
875+
/* Remain in state until there's enough space to write */
876+
return AWS_OP_SUCCESS;
877+
}
878+
}
879+
758880
/* Select next chunk to work on.
759881
* Encoder is essentially "paused" here if no chunks are available. */
760882
static int s_state_fn_chunk_next(struct aws_h1_encoder *encoder, struct aws_byte_buf *dst) {
@@ -773,7 +895,7 @@ static int s_state_fn_chunk_next(struct aws_h1_encoder *encoder, struct aws_byte
773895
ENCODER_LOGF(
774896
TRACE,
775897
encoder,
776-
"Begin sending chunk %zu with size %" PRIu64,
898+
"Begin sending chunk #%" PRIu64 " with size %" PRIu64,
777899
encoder->chunk_count,
778900
encoder->current_chunk->data_size);
779901

@@ -871,7 +993,10 @@ struct encoder_state_def {
871993
static struct encoder_state_def s_encoder_states[] = {
872994
[AWS_H1_ENCODER_STATE_INIT] = {.fn = s_state_fn_init, .name = "INIT"},
873995
[AWS_H1_ENCODER_STATE_HEAD] = {.fn = s_state_fn_head, .name = "HEAD"},
874-
[AWS_H1_ENCODER_STATE_UNCHUNKED_BODY] = {.fn = s_state_fn_unchunked_body, .name = "BODY"},
996+
[AWS_H1_ENCODER_STATE_UNCHUNKED_BODY_STREAM] = {.fn = s_state_fn_unchunked_body_stream, .name = "BODY"},
997+
[AWS_H1_ENCODER_STATE_CHUNKED_BODY_STREAM] = {.fn = s_state_fn_chunked_body_stream, .name = "CHUNKED_BODY_STREAM"},
998+
[AWS_H1_ENCODER_STATE_CHUNKED_BODY_STREAM_LAST_CHUNK] =
999+
{.fn = s_state_fn_chunked_body_stream_last_chunk, .name = "LAST_CHUNK"},
8751000
[AWS_H1_ENCODER_STATE_CHUNK_NEXT] = {.fn = s_state_fn_chunk_next, .name = "CHUNK_NEXT"},
8761001
[AWS_H1_ENCODER_STATE_CHUNK_LINE] = {.fn = s_state_fn_chunk_line, .name = "CHUNK_LINE"},
8771002
[AWS_H1_ENCODER_STATE_CHUNK_BODY] = {.fn = s_state_fn_chunk_body, .name = "CHUNK_BODY"},

‎tests/CMakeLists.txt

+4-1
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,6 @@ add_test_case(h1_encoder_transfer_encoding_chunked_across_multiple_headers)
6161
add_test_case(h1_encoder_case_insensitive_header_names)
6262
add_test_case(h1_encoder_rejects_transfer_encoding_chunked_header_combined_with_content_length)
6363
add_test_case(h1_encoder_rejects_transfer_encoding_header_without_chunked)
64-
add_test_case(h1_encoder_rejects_transfer_encoding_chunked_header_combined_with_body_stream)
6564
add_test_case(h1_encoder_transfer_encoding_chunked_header_with_multiple_encodings)
6665
add_test_case(h1_encoder_rejects_transfer_encoding_header_when_chunked_not_final_encoding)
6766
add_test_case(h1_encoder_rejects_transfer_encoding_header_not_ending_in_chunked)
@@ -76,13 +75,17 @@ add_test_case(h1_client_sanity_check)
7675
add_test_case(h1_client_request_send_1liner)
7776
add_test_case(h1_client_request_send_headers)
7877
add_test_case(h1_client_request_send_body)
78+
add_test_case(h1_client_request_send_body_chunked_and_streaming)
7979
add_test_case(h1_client_request_send_body_chunked)
8080
add_test_case(h1_client_request_send_chunked_trailer)
8181
add_test_case(h1_client_request_forbidden_trailer)
8282
add_test_case(h1_client_request_send_empty_chunked_trailer)
8383
add_test_case(h1_client_request_send_large_body)
8484
add_test_case(h1_client_request_send_large_body_chunked)
85+
add_test_case(h1_client_request_send_large_body_chunked_and_streaming)
8586
add_test_case(h1_client_request_send_large_head)
87+
add_test_case(h1_client_request_send_empty_body_chunked_and_streaming)
88+
add_test_case(h1_client_request_send_stalled_body_chunked_and_streaming)
8689
add_test_case(h1_client_request_content_length_0_ok)
8790
add_test_case(h1_client_request_waits_for_chunks)
8891
add_test_case(h1_client_request_send_chunk_from_chunk_complete_callback)

0 commit comments

Comments (0)