feat(headers): add extended parameter parser to the public API

malept · malept · commit 402fb76bb2f3 · 2015-12-15T09:56:09.000-08:00
Move the extended parameter parser from the Content-Disposition header
implementation into the common header parsing module. This allows crates that
use Hyper to parse RFC 5987-compliant header parameter values.
Add tests based on the examples given in the RFC.
diff --git a/src/header/common/content_disposition.rs b/src/header/common/content_disposition.rs
@@ -8,11 +8,11 @@
 
 use language_tags::LanguageTag;
 use std::fmt;
-use std::str::FromStr;
 use unicase::UniCase;
 use url::percent_encoding;
 
 use header::{Header, HeaderFormat, parsing};
+use header::parsing::parse_extended_value;
 use header::shared::Charset;
 
 /// The implied disposition of the content of the HTTP body
@@ -133,8 +133,8 @@ impl Header for ContentDisposition {
                             Charset::Ext("UTF-8".to_owned()), None,
                             val.trim_matches('"').as_bytes().to_owned())
                     } else if UniCase(&*key) == UniCase("filename*") {
-                        let (charset, opt_language, value) = try!(parse_ext_value(val));
-                        DispositionParam::Filename(charset, opt_language, value)
+                        let extended_value = try!(parse_extended_value(val));
+                        DispositionParam::Filename(extended_value.charset, extended_value.language_tag, extended_value.value)
                     } else {
                         DispositionParam::Ext(key.to_owned(), val.trim_matches('"').to_owned())
                     }
@@ -195,68 +195,6 @@ impl fmt::Display for ContentDisposition {
     }
 }
 
-/// Parsing of `ext-value`
-/// https://tools.ietf.org/html/rfc5987#section-3.2
-///
-/// # ABNF
-/// ```plain
-/// ext-value     = charset  "'" [ language ] "'" value-chars
-///               ; like RFC 2231's <extended-initial-value>
-///               ; (see [RFC2231], Section 7)
-///
-/// charset       = "UTF-8" / "ISO-8859-1" / mime-charset
-///
-/// mime-charset  = 1*mime-charsetc
-/// mime-charsetc = ALPHA / DIGIT
-///               / "!" / "#" / "$" / "%" / "&"
-///               / "+" / "-" / "^" / "_" / "`"
-///               / "{" / "}" / "~"
-///               ; as <mime-charset> in Section 2.3 of [RFC2978]
-///               ; except that the single quote is not included
-///               ; SHOULD be registered in the IANA charset registry
-///
-/// language      = <Language-Tag, defined in [RFC5646], Section 2.1>
-///
-/// value-chars   = *( pct-encoded / attr-char )
-///
-/// pct-encoded   = "%" HEXDIG HEXDIG
-///               ; see [RFC3986], Section 2.1
-///
-/// attr-char     = ALPHA / DIGIT
-///               / "!" / "#" / "$" / "&" / "+" / "-" / "."
-///               / "^" / "_" / "`" / "|" / "~"
-///               ; token except ( "*" / "'" / "%" )
-/// ```
-fn parse_ext_value(val: &str) -> ::Result<(Charset, Option<LanguageTag>, Vec<u8>)> {
-
-    // Break into three pieces separated by the single-quote character
-    let mut parts = val.splitn(3,'\'');
-
-    // Interpret the first piece as a Charset
-    let charset: Charset = match parts.next() {
-        None => return Err(::Error::Header),
-        Some(n) => try!(FromStr::from_str(n)),
-    };
-
-    // Interpret the second piece as a language tag
-    let lang: Option<LanguageTag> = match parts.next() {
-        None => return Err(::Error::Header),
-        Some("") => None,
-        Some(s) => match s.parse() {
-            Ok(lt) => Some(lt),
-            Err(_) => return Err(::Error::Header),
-        }
-    };
-
-    // Interpret the third piece as a sequence of value characters
-    let value: Vec<u8> = match parts.next() {
-        None => return Err(::Error::Header),
-        Some(v) => percent_encoding::percent_decode(v.as_bytes()),
-    };
-
-    Ok( (charset, lang, value) )
-}
-
 #[cfg(test)]
 mod tests {
     use super::{ContentDisposition,DispositionType,DispositionParam};
diff --git a/src/header/parsing.rs b/src/header/parsing.rs
@@ -1,7 +1,12 @@
 //! Utility functions for Header implementations.
 
+use language_tags::LanguageTag;
 use std::str;
+use std::str::FromStr;
 use std::fmt::{self, Display};
+use url::percent_encoding;
+
+use header::shared::Charset;
 
 /// Reads a single raw string when parsing a header.
 pub fn from_one_raw_str<T: str::FromStr>(raw: &[Vec<u8>]) -> ::Result<T> {
@@ -48,3 +53,131 @@ pub fn fmt_comma_delimited<T: Display>(f: &mut fmt::Formatter, parts: &[T]) -> f
     }
     Ok(())
 }
+
+/// An extended header parameter value, as defined in
+/// [RFC 5987](https://tools.ietf.org/html/rfc5987#section-3.2).
+///
+/// The value is tagged with a character set and, optionally, a language.
+/// `parse_extended_value` builds this from the raw `ext-value` text.
+#[derive(Clone, Debug, PartialEq)]
+pub struct ExtendedValue {
+    /// The character set named in the first part of the `ext-value`.
+    pub charset: Charset,
+    /// The language tag from the second part, or `None` if that part was empty.
+    pub language_tag: Option<LanguageTag>,
+    /// The parameter value as raw bytes. `parse_extended_value` stores the
+    /// percent-decoded octets here without transcoding them from `charset`.
+    pub value: Vec<u8>,
+}
+
+/// Parses an extended header parameter value (`ext-value`) as described in
+/// [RFC 5987](https://tools.ietf.org/html/rfc5987#section-3.2).
+///
+/// A parameter carries an extended value when its name ends with `*`.
+///
+/// The input is split on its first two single quotes into a character set,
+/// a language tag (which may be empty) and a percent-encoded value. The value
+/// is returned as percent-decoded bytes.
+///
+/// ## Errors
+///
+/// Returns `Error::Header` when `val` contains fewer than two single quotes,
+/// or when a non-empty language part cannot be parsed as a language tag. An
+/// error from parsing the character set is propagated to the caller.
+///
+/// ## ABNF
+/// ```plain
+/// ext-value     = charset  "'" [ language ] "'" value-chars
+///               ; like RFC 2231's <extended-initial-value>
+///               ; (see [RFC2231], Section 7)
+///
+/// charset       = "UTF-8" / "ISO-8859-1" / mime-charset
+///
+/// mime-charset  = 1*mime-charsetc
+/// mime-charsetc = ALPHA / DIGIT
+///               / "!" / "#" / "$" / "%" / "&"
+///               / "+" / "-" / "^" / "_" / "`"
+///               / "{" / "}" / "~"
+///               ; as <mime-charset> in Section 2.3 of [RFC2978]
+///               ; except that the single quote is not included
+///               ; SHOULD be registered in the IANA charset registry
+///
+/// language      = <Language-Tag, defined in [RFC5646], Section 2.1>
+///
+/// value-chars   = *( pct-encoded / attr-char )
+///
+/// pct-encoded   = "%" HEXDIG HEXDIG
+///               ; see [RFC3986], Section 2.1
+///
+/// attr-char     = ALPHA / DIGIT
+///               / "!" / "#" / "$" / "&" / "+" / "-" / "."
+///               / "^" / "_" / "`" / "|" / "~"
+///               ; token except ( "*" / "'" / "%" )
+/// ```
+pub fn parse_extended_value(val: &str) -> ::Result<ExtendedValue> {
+    // `splitn(3, ..)` stops splitting after the second quote, so any further
+    // quotes remain part of the last piece.
+    let mut pieces = val.splitn(3, '\'');
+
+    // First piece: the character set.
+    let charset_name = try!(pieces.next().ok_or(::Error::Header));
+    let charset: Charset = try!(FromStr::from_str(charset_name));
+
+    // Second piece: the language tag. An empty piece means no language was given.
+    let language = try!(pieces.next().ok_or(::Error::Header));
+    let language_tag: Option<LanguageTag> = if language.is_empty() {
+        None
+    } else {
+        Some(try!(language.parse::<LanguageTag>().map_err(|_| ::Error::Header)))
+    };
+
+    // Third piece: the percent-encoded value characters.
+    let encoded = try!(pieces.next().ok_or(::Error::Header));
+    let value: Vec<u8> = percent_encoding::percent_decode(encoded.as_bytes());
+
+    Ok(ExtendedValue {
+        charset: charset,
+        language_tag: language_tag,
+        value: value,
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use header::shared::Charset;
+    use super::parse_extended_value;
+
+    #[test]
+    fn test_parse_extended_value_with_encoding_and_language_tag() {
+        // RFC 5987, Section 3.2.2
+        // Extended notation, using the Unicode character U+00A3 (POUND SIGN),
+        // which is the single byte 0xA3 in the input.
+        let parsed = parse_extended_value("iso-8859-1'en'%A3%20rates").unwrap();
+
+        assert_eq!(Charset::Iso_8859_1, parsed.charset);
+        assert_eq!(Some(langtag!(en)), parsed.language_tag);
+        assert_eq!(b"\xA3 rates".to_vec(), parsed.value);
+    }
+
+    #[test]
+    fn test_parse_extended_value_with_encoding() {
+        // RFC 5987, Section 3.2.2
+        // Extended notation, using the Unicode characters U+00A3 (POUND SIGN)
+        // and U+20AC (EURO SIGN), given as UTF-8 byte sequences in the input.
+        let parsed = parse_extended_value("UTF-8''%c2%a3%20and%20%e2%82%ac%20rates").unwrap();
+
+        assert_eq!(Charset::Ext("UTF-8".to_owned()), parsed.charset);
+        assert!(parsed.language_tag.is_none());
+        assert_eq!(b"\xc2\xa3 and \xe2\x82\xac rates".to_vec(), parsed.value);
+    }
+
+    #[test]
+    fn test_parse_extended_value_missing_language_tag_and_encoding() {
+        // From: https://greenbytes.de/tech/tc2231/#attwithfn2231quot2
+        // The input has no single quotes at all, so only one piece exists.
+        assert!(parse_extended_value("foo%20bar.html").is_err());
+    }
+
+    #[test]
+    fn test_parse_extended_value_partially_formatted() {
+        // Only one single quote: the value piece is missing.
+        assert!(parse_extended_value("UTF-8'missing third part").is_err());
+    }
+
+    #[test]
+    fn test_parse_extended_value_partially_formatted_blank() {
+        // One trailing single quote: the second piece is empty and the value
+        // piece is still missing.
+        assert!(parse_extended_value("blank second part'").is_err());
+    }
+}