From c9ad5e86caff7793b2872db19fdf40782736198d Mon Sep 17 00:00:00 2001 From: Sunshine Date: Fri, 22 Oct 2021 15:00:47 -1000 Subject: [PATCH] add proper support for custom charsets --- Cargo.lock | 6 +- Cargo.toml | 2 +- README.md | 19 ++++--- src/lib.rs | 131 +++++++++++++++++++++++++++++++++----------- src/main.rs | 67 +++++++++++++++++----- tests/cli/basic.rs | 23 +++++++- tests/cli/decode.rs | 2 +- tests/cli/encode.rs | 34 ++++++++---- tests/lib/parse.rs | 2 +- 9 files changed, 208 insertions(+), 78 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c7ed393..349c238 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -80,7 +80,7 @@ dependencies = [ [[package]] name = "dataurl" -version = "0.1.0" +version = "0.1.1" dependencies = [ "assert_cmd", "base64", @@ -227,9 +227,9 @@ checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" [[package]] name = "termtree" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78fbf2dd23e79c28ccfa2472d3e6b3b189866ffef1aeb91f17c2d968b6586378" +checksum = "76565a2f8df1d2170b5c365aa39d0623fd93fec20545edde299233cea82d0f16" [[package]] name = "textwrap" diff --git a/Cargo.toml b/Cargo.toml index c21a34a..9952c6e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "dataurl" -version = "0.1.0" +version = "0.1.1" authors = [ "Sunshine ", ] diff --git a/README.md b/README.md index de50e97..aea5789 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ # dataurl -CLI tool and Rust crate for converting files into data URLs and back +CLI tool / Rust crate for converting files and text into data URLs and back --------------------------------------------------- @@ -68,9 +68,9 @@ assert_eq!(data_url.get_text(), "Hello, World!"); ```console dataurl "some text" ``` - +val#f' > index.html ```console -dataurl -d 'data:text/html,text...

bottom?arg=val#f' > index.html +dataurl -d 'data:text/html,textok?a=v#f' > index.html ``` ```console @@ -88,14 +88,15 @@ cat file.png | dataurl --------------------------------------------------- -## Options +## Flags and options - - `-b`: Prefer to use base64 even when not necessary - - `-c`: Use custom `charset` (automatically sets `-b` if not `US-ASCII` or `windows-1252`) + - `-b`: Encode data using base64 - `-d`: Attempt to parse input, output resulting data - - `-f`: Append custom `fragment` - - `-i`: Path to `file` to treat as input (use `-` for STDIN) - - `-t`: Adjust `media type` + + - `-c`: Use custom `charset` + - `-f`: Append `fragment` + - `-i`: Specify `file` to read data from (use `-` for STDIN) + - `-m`: Adjust `media type` --------------------------------------------------- diff --git a/src/lib.rs b/src/lib.rs index 619273d..661812b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,5 @@ use encoding_rs::Encoding; -use percent_encoding::{percent_decode_str, utf8_percent_encode, NON_ALPHANUMERIC}; +use percent_encoding::{percent_decode_str, percent_encode, utf8_percent_encode, NON_ALPHANUMERIC}; use std::fmt; use url::Url; @@ -8,10 +8,10 @@ const DEFAULT_CHARSET: &'static str = "US-ASCII"; // TODO: add support for other optional parameters besides charset (filename, etc) pub struct DataUrl { - media_type: Option, // Mime type + media_type: Option, // Media type charset: Option, // US-ASCII is default, according to the spec is_base64_encoded: bool, // Indicates if it's a base64-encoded data URL - data: Vec, // Data, bytes + data: Vec, // Data, bytes, always UTF-8 fragment: Option, // #something-at-the-end, None by default } @@ -27,12 +27,14 @@ impl fmt::Debug for DataUrlParseError { } } -fn validate_media_type(input: &str) -> bool { +pub(crate) fn validate_media_type(input: &str) -> bool { // Must contain one slash input.split('/').collect::>().len() == 2 } -fn parse_data_url_meta_data(meta_data_string: String) -> (Option, Option, bool) { +pub(crate) fn parse_data_url_meta_data( + meta_data_string: String, +) -> (Option, Option, bool) { let mut media_type: Option = None; let mut charset: Option = None; let mut is_base64_encoded: bool = false; @@ -182,20 +184,59 @@ impl DataUrl { } pub fn set_charset(&mut self, new_charset: Option) -> bool { - if let Some(c) = new_charset { + let c: Option; + let success; + + if let Some(nc) = new_charset { // Validate the input - if let Some(e) = Encoding::for_label_no_replacement(c.as_bytes()) { - self.charset = Some(e.name().to_string()); - true + if let Some(e) = Encoding::for_label_no_replacement(nc.as_bytes()) { + c = Some(e.name().to_string()); + success = true; } else { - // Since browsers fall back to US-ASCII, so do we - self.charset = None; - false + // Since browsers fall back to US-ASCII, so does this + c = None; + success = false; } } else { - self.charset = None; - true + // Unset + c = None; + success = true; } + + /* + // Check if already has the same charset + if self.charset != c { + if self.data.len() > 0 { + // Re-encode existing data from old charset into new one + // Can be lossy if not careful + + // 1. Decode our current data into UTF-8 (if needed) + if self.charset.is_some() + && self.charset != Some("US-ASCII".to_string()) + && self.charset != Some("windows-1252".to_string()) + && self.charset != Some("UTF-8".to_string()) + { + if let Some(encoding) = Encoding::for_label_no_replacement(self.charset.as_ref().unwrap().as_bytes()) { + let (decoded, _, _) = encoding.decode(&self.data); + self.data = decoded.as_bytes().to_vec(); + } + } + + // 2. Encode our UTF-8 data into whatever encoding it's now set to have + if let Some(encoding) = Encoding::for_label_no_replacement(c.clone().unwrap().as_bytes()) { + let input = &String::from_utf8_lossy(&self.data); + let (encoded, _, _) = encoding.encode(input); + self.data = encoded.to_vec(); + } + } + + self.charset = c; + } + */ + + self.charset = c; + + success } // TODO: ditch get/set_is_base64_encode and implement two separate functions, to_precent_encoded_string, and to_base64_encoded_string? @@ -215,7 +256,7 @@ impl DataUrl { pub fn get_text(&self) -> String { // This can never really fail - if let Some(encoding) = Encoding::for_label( + if let Some(encoding) = Encoding::for_label_no_replacement( self.charset .as_ref() .unwrap_or(&DEFAULT_CHARSET.to_string()) @@ -228,9 +269,24 @@ impl DataUrl { } } - // TODO - // pub fn set_text(&self, Option) { - // } + /* + // TODO: add new_text_charset argument? + pub fn set_text(&mut self, new_text: &str) { + if self.charset == Some("UTF-8".to_string()) { + self.data = new_text.as_bytes().to_vec(); + } else { + if let Some(encoding) = Encoding::for_label_no_replacement( + self.charset + .as_ref() + .unwrap_or(&DEFAULT_CHARSET.to_string()) + .as_bytes(), + ) { + let (decoded, _, _) = encoding.decode(&new_text.as_bytes()); + self.data = decoded.as_bytes().to_vec(); + } + } + } + */ pub fn set_data(&mut self, new_data: &[u8]) { self.data = new_data.to_vec(); @@ -251,7 +307,7 @@ impl DataUrl { // TODO: rename it to as_str/to_str, make it return a &str instead of String // TODO: make it an Option(Result?), throw error in case is_base64_encoded=false, and charset!=default|utf8 pub fn to_string(&self) -> String { - let mut result: String = "data:".to_string(); + let mut result: String = String::from("data:"); if let Some(mt) = &self.media_type { result += &mt; @@ -265,31 +321,40 @@ impl DataUrl { } } - if self.is_base64_encoded { - result += ";base64,"; + { + if self.is_base64_encoded { + result += ";base64"; + } + result += ","; + if self.data.len() > 0 { - // This can never fail - if let Some(encoding) = Encoding::for_label( + let data_as_utf8_string: String = String::from_utf8_lossy(&self.data).to_string(); + let fallback_charset: String = if data_as_utf8_string.is_ascii() { + DEFAULT_CHARSET.to_string() + } else { + "UTF-8".to_string() + }; + + if let Some(encoding) = Encoding::for_label_no_replacement( self.charset .as_ref() - .unwrap_or(&DEFAULT_CHARSET.to_string()) + .unwrap_or(&fallback_charset) .as_bytes(), ) { - let (decoded, _, _) = encoding.decode(&self.data); - result += &base64::encode(&decoded.as_bytes()); + let (encoded, _, _) = encoding.encode(&data_as_utf8_string); + + if self.is_base64_encoded { + result += &base64::encode(&encoded.to_vec()); + } else { + result += &percent_encode(&encoded.to_vec(), NON_ALPHANUMERIC).to_string(); + } } } - } else { - result += ","; - if self.data.len() > 0 { - result += - &utf8_percent_encode(&String::from_utf8_lossy(&self.data), NON_ALPHANUMERIC) - .to_string(); - } } if let Some(f) = &self.fragment { result += "#"; + // TODO: need to deal with encoding here as well result += &utf8_percent_encode(f, NON_ALPHANUMERIC).to_string(); } diff --git a/src/main.rs b/src/main.rs index 9000090..485577e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -50,7 +50,7 @@ fn main() { ) .arg( Arg::with_name("media_type") - .short("t") + .short("m") .long("media-type") .multiple(false) .takes_value(true) @@ -59,65 +59,102 @@ fn main() { .arg(Arg::with_name("INPUT").help("Input string").required(false)) .get_matches(); + ////////////////////////////////////////////////////////////////////////// + let is_in_decode_mode: bool = app.is_present("decode"); - let input: Vec = if app.is_present("INPUT") { + let has_arg_input: bool = app.is_present("INPUT"); + let mut has_file_input: bool = app.is_present("FILE"); + let input_file_path: &str = if has_file_input { + app.value_of("FILE").unwrap() + } else { + "" + }; + if has_file_input && input_file_path == "-" { + has_file_input = false; + } + + ////////////////////////////////////////////////////////////////////////// + + if has_arg_input && has_file_input { + eprintln!("error: Both file and argument inputs provided"); + std::process::exit(1); + } + + ////////////////////////////////////////////////////////////////////////// + + let input: Vec = if has_arg_input { app.value_of("INPUT").unwrap().as_bytes().to_vec() - } else if app.is_present("FILE") { - match fs::read(app.value_of("FILE").unwrap()) { - Ok(f) => f, + } else if has_file_input { + match fs::read(input_file_path) { + Ok(input_file_data) => input_file_data, Err(_) => { - eprintln!( - "Error: unable to read input file {}.", - app.value_of("FILE").unwrap() - ); + eprintln!("error: Unable to read input file '{}'", input_file_path); std::process::exit(1); } } } else { - eprintln!("Error: no input provided."); + eprintln!("error: No input provided"); vec![] }; + ////////////////////////////////////////////////////////////////////////// + if is_in_decode_mode { - let input_as_string: &str = std::str::from_utf8(&input).unwrap(); - std::process::exit(match DataUrl::parse(input_as_string) { + // TODO: ideally the program needs to check the current terminal locale (encoding), and not just assume it's UTF-8 + let input_as_string: String = String::from_utf8_lossy(&input).to_string(); + + std::process::exit(match DataUrl::parse(&input_as_string) { Ok(data_url) => { println!("{}", data_url.get_text()); 0 } Err(err) => { - eprintln!("Error: {:?}.", err); + eprintln!("error: {:?}", err); 1 } }); } else { let mut data_url = DataUrl::new(); + data_url.set_data(&input); + if app.is_present("base64") { data_url.set_is_base64_encoded(true); } + if app.is_present("charset") { let charset: &str = app.value_of("charset").unwrap(); let success: bool = data_url.set_charset(Some(charset.to_string())); if !success { - eprintln!("Error: invalid charset {}.", charset); + eprintln!("error: Invalid charset '{}'", charset); std::process::exit(1); } + } else { + // TODO: ideally the program needs to check the current terminal locale (encoding), and not just assume it's UTF-8 + + // Automatically enforce ;charset=UTF-8 for non-ascii argument inputs + if has_arg_input && !String::from_utf8_lossy(&input).to_string().is_ascii() { + data_url.set_charset(Some("UTF-8".to_string())); + } } + if app.is_present("media_type") { let media_type: &str = app.value_of("media_type").unwrap(); let success: bool = data_url.set_media_type(Some(media_type.to_string())); if !success { - eprintln!("Error: invalid media type {}.", media_type); + eprintln!("error: Invalid media type '{}'", media_type); std::process::exit(1); } } + if app.is_present("fragment") { data_url.set_fragment(Some(app.value_of("fragment").unwrap().to_string())); } + println!("{}", data_url.to_string()); + std::process::exit(0); } } diff --git a/tests/cli/basic.rs b/tests/cli/basic.rs index 0673bf9..f33b249 100644 --- a/tests/cli/basic.rs +++ b/tests/cli/basic.rs @@ -34,7 +34,7 @@ OPTIONS: -i, --input-file Provides input file -c, --charset Sets custom charset -f, --fragment Appends URL fragment - -t, --media-type Sets custom media type + -m, --media-type Sets custom media type ARGS: Input string @@ -99,12 +99,29 @@ mod failing { USAGE: {bin}{exe} [FLAGS] [OPTIONS] [INPUT] -For more information try --help -", +For more information try --help\n", bin = env!("CARGO_PKG_NAME"), exe = if cfg!(windows) { ".exe" } else { "" } )) // STDOUT must contain absolutely nothing .stdout(""); } + + #[test] + fn must_fail_when_both_file_and_argument_input_given() { + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + let assert = cmd + .arg("-i") + .arg("_data_/text-file.txt") + .arg("text") + .assert(); + + assert + // Exit code must be 1 + .failure() + // STDERR must contain error message + .stderr("error: Both file and argument inputs provided\n") + // STDOUT must contain absolutely nothing + .stdout(""); + } } diff --git a/tests/cli/decode.rs b/tests/cli/decode.rs index c82202b..da48cba 100644 --- a/tests/cli/decode.rs +++ b/tests/cli/decode.rs @@ -64,7 +64,7 @@ mod failing { // Exit code must be 1 .failure() // STDERR must contain error message - .stderr("Error: DataUrlParseError.\n") + .stderr("error: DataUrlParseError\n") // STDOUT must be empty .stdout(""); } diff --git a/tests/cli/encode.rs b/tests/cli/encode.rs index be3c5f2..8201064 100644 --- a/tests/cli/encode.rs +++ b/tests/cli/encode.rs @@ -70,7 +70,7 @@ mod passing { #[test] fn must_support_setting_media_type() { let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); - let assert = cmd.arg("-b").arg(" ").arg("-t").arg("text/html").assert(); + let assert = cmd.arg("-b").arg("-m").arg("text/html").arg(" ").assert(); assert // Exit code must be 0 @@ -84,7 +84,7 @@ mod passing { #[test] fn must_support_setting_charset() { let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); - let assert = cmd.arg("-b").arg(" ").arg("-c").arg("utf8").assert(); + let assert = cmd.arg("-b").arg("-c").arg("utf8").arg(" ").assert(); assert // Exit code must be 0 @@ -98,7 +98,7 @@ mod passing { #[test] fn must_set_fragment_if_provided() { let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); - let assert = cmd.arg("-b").arg(" ").arg("-f").arg("something").assert(); + let assert = cmd.arg("-b").arg("-f").arg("something").arg(" ").assert(); assert // Exit code must be 0 @@ -112,7 +112,7 @@ mod passing { #[test] fn must_set_empty_fragment_if_provided() { let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); - let assert = cmd.arg("-b").arg(" ").arg("-f").arg("").assert(); + let assert = cmd.arg("-b").arg("-f").arg("").arg(" ").assert(); assert // Exit code must be 0 @@ -122,6 +122,20 @@ mod passing { // STDOUT must contain generated data URL .stdout("data:;base64,IA==#\n"); } + + #[test] + fn must_support_gbk_encoded_data_urls() { + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + let assert = cmd.arg("-c").arg("gbk").arg("Ü").assert(); + + assert + // Exit code must be 0 + .success() + // STDERR must be completely empty + .stderr("") + // STDOUT must contain properly encoded data URL + .stdout("data:;charset=GBK,%26%23220%3B\n"); + } } // ███████╗ █████╗ ██╗██╗ ██╗███╗ ██╗ ██████╗ @@ -139,17 +153,13 @@ mod failing { #[test] fn must_not_allow_incorrect_media_type_to_be_set() { let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); - let assert = cmd - .arg("-t") - .arg("wrong/media/type") - .arg("something") - .assert(); + let assert = cmd.arg("-m").arg("wrong/media/type").arg("Ü").assert(); assert // Exit code must be 1 .failure() // STDERR must contain error message - .stderr("Error: invalid media type wrong/media/type.\n") + .stderr("error: Invalid media type 'wrong/media/type'\n") // STDOUT must be empty .stdout(""); } @@ -157,13 +167,13 @@ mod failing { #[test] fn must_not_allow_incorrect_charset_to_be_set() { let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); - let assert = cmd.arg("-c").arg("BAD-CHARSET").arg("something").assert(); + let assert = cmd.arg("-c").arg("BAD-CHARSET").arg("Ü").assert(); assert // Exit code must be 1 .failure() // STDERR must contain error message - .stderr("Error: invalid charset BAD-CHARSET.\n") + .stderr("error: Invalid charset 'BAD-CHARSET'\n") // STDOUT must be empty .stdout(""); } diff --git a/tests/lib/parse.rs b/tests/lib/parse.rs index a489ad2..5e3ac05 100644 --- a/tests/lib/parse.rs +++ b/tests/lib/parse.rs @@ -234,7 +234,7 @@ mod failing { assert_eq!(data_url.get_fragment(), None); assert_eq!(data_url.get_text(), "Ãœ"); // Different from the original her because we needed to encode "Ãœ" as US-ASCII, which is 4 bytes - assert_eq!(data_url.to_string(), "data:text/css;base64,w4PFkw=="); + assert_eq!(data_url.to_string(), "data:text/css;base64,w5w="); data_url.set_charset(Some("utf-8".to_string())); // And now it should be fine assert_eq!(