From a131b430a0e2e227c8771212dc5f469cd08e5dce Mon Sep 17 00:00:00 2001 From: Kevin Cantu Date: Sat, 11 Feb 2012 01:57:39 -0800 Subject: [PATCH 1/6] core::str rename [r]index -> [r]index_byte --- src/cargo/cargo.rs | 4 ++-- src/comp/back/link.rs | 2 +- src/comp/syntax/codemap.rs | 2 +- src/fuzzer/fuzzer.rs | 2 +- src/libcore/str.rs | 22 ++++++++++++---------- src/libstd/fs.rs | 8 ++++---- src/libstd/getopts.rs | 2 +- 7 files changed, 22 insertions(+), 20 deletions(-) diff --git a/src/cargo/cargo.rs b/src/cargo/cargo.rs index 8536bf7d830ab..aa042ea1ae468 100644 --- a/src/cargo/cargo.rs +++ b/src/cargo/cargo.rs @@ -651,7 +651,7 @@ fn cmd_install(c: cargo) unsafe { if str::starts_with(target, "uuid:") { let uuid = rest(target, 5u); - let idx = str::index(uuid, '/' as u8); + let idx = str::index_byte(uuid, '/' as u8); if idx != -1 { let source = str::unsafe::slice_bytes(uuid, 0u, idx as uint); uuid = str::unsafe::slice_bytes(uuid, idx as uint + 1u, @@ -662,7 +662,7 @@ fn cmd_install(c: cargo) unsafe { } } else { let name = target; - let idx = str::index(name, '/' as u8); + let idx = str::index_byte(name, '/' as u8); if idx != -1 { let source = str::unsafe::slice_bytes(name, 0u, idx as uint); name = str::unsafe::slice_bytes(name, idx as uint + 1u, diff --git a/src/comp/back/link.rs b/src/comp/back/link.rs index cb2070c124c5a..4f32ad4a31fc7 100644 --- a/src/comp/back/link.rs +++ b/src/comp/back/link.rs @@ -109,7 +109,7 @@ mod write { // Decides what to call an intermediate file, given the name of the output // and the extension to use. fn mk_intermediate_name(output_path: str, extension: str) -> str unsafe { - let dot_pos = str::index(output_path, '.' as u8); + let dot_pos = str::index_byte(output_path, '.' 
as u8); let stem; if dot_pos < 0 { stem = output_path; diff --git a/src/comp/syntax/codemap.rs b/src/comp/syntax/codemap.rs index 5d8d7ff563332..27f968b1156f7 100644 --- a/src/comp/syntax/codemap.rs +++ b/src/comp/syntax/codemap.rs @@ -125,7 +125,7 @@ fn get_line(fm: filemap, line: int) -> str unsafe { // the remainder of the file, which is undesirable. end = str::byte_len(*fm.src); let rest = str::unsafe::slice_bytes(*fm.src, begin, end); - let newline = str::index(rest, '\n' as u8); + let newline = str::index_byte(rest, '\n' as u8); if newline != -1 { end = begin + (newline as uint); } } ret str::unsafe::slice_bytes(*fm.src, begin, end); diff --git a/src/fuzzer/fuzzer.rs b/src/fuzzer/fuzzer.rs index a5cfb8db3f7cf..e3f452f687040 100644 --- a/src/fuzzer/fuzzer.rs +++ b/src/fuzzer/fuzzer.rs @@ -284,7 +284,7 @@ fn check_variants_T( } fn last_part(filename: str) -> str unsafe { - let ix = str::rindex(filename, 47u8 /* '/' */); + let ix = str::rindex_byte(filename, 47u8 /* '/' */); assert ix >= 0; str::unsafe::slice_bytes(filename, ix as uint + 1u, str::byte_len(filename) - 3u) } diff --git a/src/libcore/str.rs b/src/libcore/str.rs index b98d96bb901a0..bc96475990f99 100644 --- a/src/libcore/str.rs +++ b/src/libcore/str.rs @@ -70,8 +70,10 @@ export lines_iter, // Searching - index, - rindex, + //index, + //rindex, + index_byte, + rindex_byte, find, contains, starts_with, @@ -876,7 +878,7 @@ no match is found. FIXME: UTF-8 */ -fn index(s: str, c: u8) -> int { +fn index_byte(s: str, c: u8) -> int { let i: int = 0; for k: u8 in s { if k == c { ret i; } i += 1; } ret -1; @@ -890,7 +892,7 @@ if no match is found. 
FIXME: UTF-8 */ -fn rindex(s: str, c: u8) -> int { +fn rindex_byte(s: str, c: u8) -> int { let n: int = byte_len(s) as int; while n >= 0 { if s[n] == c { ret n; } n -= 1; } ret n; @@ -1443,12 +1445,12 @@ mod tests { #[test] fn test_index_and_rindex() { - assert (index("hello", 'e' as u8) == 1); - assert (index("hello", 'o' as u8) == 4); - assert (index("hello", 'z' as u8) == -1); - assert (rindex("hello", 'l' as u8) == 3); - assert (rindex("hello", 'h' as u8) == 0); - assert (rindex("hello", 'z' as u8) == -1); + assert (index_byte("hello", 'e' as u8) == 1); + assert (index_byte("hello", 'o' as u8) == 4); + assert (index_byte("hello", 'z' as u8) == -1); + assert (rindex_byte("hello", 'l' as u8) == 3); + assert (rindex_byte("hello", 'h' as u8) == 0); + assert (rindex_byte("hello", 'z' as u8) == -1); } #[test] diff --git a/src/libstd/fs.rs b/src/libstd/fs.rs index 2304445b95b18..874d92cc4b9d4 100644 --- a/src/libstd/fs.rs +++ b/src/libstd/fs.rs @@ -44,9 +44,9 @@ The dirname of "/usr/share" will be "/usr", but the dirname of If the path is not prefixed with a directory, then "." is returned. */ fn dirname(p: path) -> path unsafe { - let i: int = str::rindex(p, os_fs::path_sep as u8); + let i: int = str::rindex_byte(p, os_fs::path_sep as u8); if i == -1 { - i = str::rindex(p, os_fs::alt_path_sep as u8); + i = str::rindex_byte(p, os_fs::alt_path_sep as u8); if i == -1 { ret "."; } } ret str::unsafe::slice_bytes(p, 0u, i as uint); @@ -64,9 +64,9 @@ the provided path. If an empty path is provided or the path ends with a path separator then an empty path is returned. 
*/ fn basename(p: path) -> path unsafe { - let i: int = str::rindex(p, os_fs::path_sep as u8); + let i: int = str::rindex_byte(p, os_fs::path_sep as u8); if i == -1 { - i = str::rindex(p, os_fs::alt_path_sep as u8); + i = str::rindex_byte(p, os_fs::alt_path_sep as u8); if i == -1 { ret p; } } let len = str::byte_len(p); diff --git a/src/libstd/getopts.rs b/src/libstd/getopts.rs index 48d83f0e5c2c8..33674fe67f38d 100644 --- a/src/libstd/getopts.rs +++ b/src/libstd/getopts.rs @@ -230,7 +230,7 @@ fn getopts(args: [str], opts: [opt]) -> result unsafe { let i_arg = option::none::<str>; if cur[1] == '-' as u8 { let tail = str::unsafe::slice_bytes(cur, 2u, curlen); - let eq = str::index(tail, '=' as u8); + let eq = str::index_byte(tail, '=' as u8); if eq == -1 { names = [long(tail)]; } else { From 14baf88f89241f1384e4d12b4751910fe16c947c Mon Sep 17 00:00:00 2001 From: Kevin Cantu Date: Sat, 11 Feb 2012 03:04:12 -0800 Subject: [PATCH 2/6] core::str: added index (char) --- src/libcore/str.rs | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/src/libcore/str.rs b/src/libcore/str.rs index bc96475990f99..06a49b8d33992 100644 --- a/src/libcore/str.rs +++ b/src/libcore/str.rs @@ -870,6 +870,30 @@ fn lines_iter(ss: str, ff: fn(&&str)) { Section: Searching */ +// Function: index +// +// Returns the index of the first matching char +// (as option some/none) +fn index(ss: str, cc: char) -> option<uint> { + let bii = 0u; + let cii = 0u; + let len = byte_len(ss); + while bii < len { + let {ch, next} = char_range_at(ss, bii); + + // found here? 
+ if ch == cc { + ret option::some(cii); + } + + cii += 1u; + bii = next; + } + + // wasn't found + ret option::none; +} + /* Function: index @@ -1448,6 +1472,9 @@ mod tests { assert (index_byte("hello", 'e' as u8) == 1); assert (index_byte("hello", 'o' as u8) == 4); assert (index_byte("hello", 'z' as u8) == -1); + assert (index("hello", 'e') == option::some(1u)); + assert (index("hello", 'o') == option::some(4u)); + assert (index("hello", 'z') == option::none); assert (rindex_byte("hello", 'l' as u8) == 3); assert (rindex_byte("hello", 'h' as u8) == 0); assert (rindex_byte("hello", 'z' as u8) == -1); From 27161f4415e484680cf404b9819bf37d66c26783 Mon Sep 17 00:00:00 2001 From: Kevin Cantu Date: Sat, 11 Feb 2012 03:20:45 -0800 Subject: [PATCH 3/6] using str::index... --- src/cargo/cargo.rs | 34 ++++++++++++++++++---------------- src/comp/back/link.rs | 14 ++++++++------ src/comp/syntax/codemap.rs | 7 ++----- src/libcore/str.rs | 2 +- src/libstd/getopts.rs | 10 ++++------ 5 files changed, 33 insertions(+), 34 deletions(-) diff --git a/src/cargo/cargo.rs b/src/cargo/cargo.rs index aa042ea1ae468..ac1727ffdb7c8 100644 --- a/src/cargo/cargo.rs +++ b/src/cargo/cargo.rs @@ -651,25 +651,27 @@ fn cmd_install(c: cargo) unsafe { if str::starts_with(target, "uuid:") { let uuid = rest(target, 5u); - let idx = str::index_byte(uuid, '/' as u8); - if idx != -1 { - let source = str::unsafe::slice_bytes(uuid, 0u, idx as uint); - uuid = str::unsafe::slice_bytes(uuid, idx as uint + 1u, - str::byte_len(uuid)); - install_uuid_specific(c, wd, source, uuid); - } else { - install_uuid(c, wd, uuid); + alt str::index(uuid, '/') { + option::some(idx) { + let source = str::slice(uuid, 0u, idx); + uuid = str::slice(uuid, idx + 1u, str::char_len(uuid)); + install_uuid_specific(c, wd, source, uuid); + } + option::none { + install_uuid(c, wd, uuid); + } } } else { let name = target; - let idx = str::index_byte(name, '/' as u8); - if idx != -1 { - let source = str::unsafe::slice_bytes(name, 0u, idx 
as uint); - name = str::unsafe::slice_bytes(name, idx as uint + 1u, - str::byte_len(name)); - install_named_specific(c, wd, source, name); - } else { - install_named(c, wd, name); + alt str::index(name, '/') { + option::some(idx) { + let source = str::slice(name, 0u, idx); + name = str::slice(name, idx + 1u, str::char_len(name)); + install_named_specific(c, wd, source, name); + } + option::none { + install_named(c, wd, name); + } } } } diff --git a/src/comp/back/link.rs b/src/comp/back/link.rs index 4f32ad4a31fc7..5fa7841c97246 100644 --- a/src/comp/back/link.rs +++ b/src/comp/back/link.rs @@ -109,14 +109,16 @@ mod write { // Decides what to call an intermediate file, given the name of the output // and the extension to use. fn mk_intermediate_name(output_path: str, extension: str) -> str unsafe { - let dot_pos = str::index_byte(output_path, '.' as u8); - let stem; - if dot_pos < 0 { - stem = output_path; - } else { stem = str::unsafe::slice_bytes(output_path, 0u, - dot_pos as uint); } + let stem = alt str::index(output_path, '.') { + option::some(dot_pos) { + str::slice(output_path, 0u, dot_pos) + } + option::none { output_path } + }; + ret stem + "." + extension; } + fn run_passes(sess: session, llmod: ModuleRef, output: str) { let opts = sess.opts; if opts.time_llvm_passes { llvm::LLVMRustEnableTimePasses(); } diff --git a/src/comp/syntax/codemap.rs b/src/comp/syntax/codemap.rs index 27f968b1156f7..cb2590e195188 100644 --- a/src/comp/syntax/codemap.rs +++ b/src/comp/syntax/codemap.rs @@ -119,16 +119,13 @@ fn get_line(fm: filemap, line: int) -> str unsafe { let end: uint; if line as uint < vec::len(fm.lines) - 1u { end = fm.lines[line + 1].byte - fm.start_pos.byte; + ret str::unsafe::slice_bytes(*fm.src, begin, end); } else { // If we're not done parsing the file, we're at the limit of what's // parsed. If we just slice the rest of the string, we'll print out // the remainder of the file, which is undesirable. 
- end = str::byte_len(*fm.src); - let rest = str::unsafe::slice_bytes(*fm.src, begin, end); - let newline = str::index_byte(rest, '\n' as u8); - if newline != -1 { end = begin + (newline as uint); } + ret str::splitn_char(*fm.src, '\n', 1u)[0]; } - ret str::unsafe::slice_bytes(*fm.src, begin, end); } fn lookup_byte_offset(cm: codemap::codemap, chpos: uint) diff --git a/src/libcore/str.rs b/src/libcore/str.rs index 06a49b8d33992..9ff1fcfb59e14 100644 --- a/src/libcore/str.rs +++ b/src/libcore/str.rs @@ -70,7 +70,7 @@ export lines_iter, // Searching - //index, + index, //rindex, index_byte, rindex_byte, diff --git a/src/libstd/getopts.rs b/src/libstd/getopts.rs index 33674fe67f38d..8288501defc31 100644 --- a/src/libstd/getopts.rs +++ b/src/libstd/getopts.rs @@ -230,16 +230,14 @@ fn getopts(args: [str], opts: [opt]) -> result unsafe { let i_arg = option::none::<str>; if cur[1] == '-' as u8 { let tail = str::unsafe::slice_bytes(cur, 2u, curlen); - let eq = str::index_byte(tail, '=' as u8); - if eq == -1 { + let tail_eq = str::splitn_char(tail, '=', 1u); + if vec::len(tail_eq) <= 1u { names = [long(tail)]; } else { names = - [long(str::unsafe::slice_bytes(tail,0u,eq as uint))]; + [long(tail_eq[0])]; i_arg = - option::some::<str>(str::unsafe::slice_bytes(tail, - (eq as uint) + 1u, - curlen - 2u)); + option::some::<str>(tail_eq[1]); } } else { let j = 1u; From 50360873f8f7abbe7232cdd8f89d5ce691711acc Mon Sep 17 00:00:00 2001 From: Kevin Cantu Date: Sat, 11 Feb 2012 05:03:03 -0800 Subject: [PATCH 4/6] (core::str) added rindex and rewrote pop_char with char_range_at_reverse --- src/libcore/str.rs | 104 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 91 insertions(+), 13 deletions(-) diff --git a/src/libcore/str.rs b/src/libcore/str.rs index 9ff1fcfb59e14..5a879fb0cf83f 100644 --- a/src/libcore/str.rs +++ b/src/libcore/str.rs @@ -71,7 +71,7 @@ export // Searching index, - //rindex, + rindex, index_byte, rindex_byte, find, @@ -255,15 +255,12 @@ Function: pop_char Remove the 
final character from a string and return it. Failure: - If the string does not contain any characters. */ fn pop_char(&s: str) -> char unsafe { let end = byte_len(s); - while end > 0u && s[end - 1u] & 192u8 == tag_cont_u8 { end -= 1u; } - assert (end > 0u); - let ch = char_at(s, end - 1u); - s = unsafe::slice_bytes(s, 0u, end - 1u); + let {ch:ch, prev:end} = char_range_at_reverse(s, end); + s = unsafe::slice_bytes(s, 0u, end); ret ch; } @@ -894,6 +891,28 @@ fn index(ss: str, cc: char) -> option<uint> { ret option::none; } +// Function: rindex +// +// Returns the index of the last matching char +// (as option some/none) +fn rindex(ss: str, cc: char) -> option<uint> { + let bii = byte_len(ss); + let cii = char_len(ss); + while bii > 0u { + let {ch, prev} = char_range_at_reverse(ss, bii); + cii -= 1u; + bii = prev; + + // found here? + if ch == cc { + ret option::some(cii); + } + } + + // wasn't found + ret option::none; +} + /* Function: index @@ -1259,6 +1278,25 @@ Pluck a character out of a string */ fn char_at(s: str, i: uint) -> char { ret char_range_at(s, i).ch; } +// Function: char_range_at_reverse +// +// Given a byte position and a str, return the previous char and its position +// This function can be used to iterate over a unicode string in reverse. +fn char_range_at_reverse(ss: str, start: uint) -> {ch: char, prev: uint} { + let prev = start; + + // while there is a previous byte == 10...... 
+ while prev > 0u && ss[prev - 1u] & 192u8 == tag_cont_u8 { + prev -= 1u; + } + + // now refer to the initial byte of previous char + prev -= 1u; + + let ch = char_at(ss, prev); + ret {ch:ch, prev:prev}; +} + /* Function: substr_all @@ -1468,18 +1506,58 @@ mod tests { } #[test] - fn test_index_and_rindex() { - assert (index_byte("hello", 'e' as u8) == 1); - assert (index_byte("hello", 'o' as u8) == 4); - assert (index_byte("hello", 'z' as u8) == -1); - assert (index("hello", 'e') == option::some(1u)); - assert (index("hello", 'o') == option::some(4u)); - assert (index("hello", 'z') == option::none); + fn test_index() { + assert ( index("hello", 'h') == option::some(0u)); + assert ( index("hello", 'e') == option::some(1u)); + assert ( index("hello", 'o') == option::some(4u)); + assert ( index("hello", 'z') == option::none); + } + + #[test] + fn test_rindex() { + assert (rindex("hello", 'l') == option::some(3u)); + assert (rindex("hello", 'o') == option::some(4u)); + assert (rindex("hello", 'h') == option::some(0u)); + assert (rindex("hello", 'z') == option::none); + } + + #[test] + fn test_index_byte() { + assert ( index_byte("hello", 'e' as u8) == 1); + assert ( index_byte("hello", 'o' as u8) == 4); + assert ( index_byte("hello", 'z' as u8) == -1); + } + + #[test] + fn test_rindex_byte() { assert (rindex_byte("hello", 'l' as u8) == 3); assert (rindex_byte("hello", 'h' as u8) == 0); assert (rindex_byte("hello", 'z' as u8) == -1); } + #[test] + fn test_pop_char() { + let data = "ประเทศไทย中华"; + let cc = pop_char(data); + assert "ประเทศไทย中" == data; + assert '华' == cc; + } + + #[test] + fn test_pop_char_2() { + let data2 = "华"; + let cc2 = pop_char(data2); + assert "" == data2; + assert '华' == cc2; + } + + #[test] + #[should_fail] + fn test_pop_char_fail() { + let data = ""; + let _cc3 = pop_char(data); + } + #[test] fn test_split_byte() { fn t(s: str, c: char, u: [str]) { From e0af23b664a1307fe376f2638bb7a69f04e2ac1c Mon Sep 17 00:00:00 2001 From: Kevin Cantu Date: 
Sat, 11 Feb 2012 16:31:13 -0800 Subject: [PATCH 5/6] using str::rindex... --- src/fuzzer/fuzzer.rs | 7 +++---- src/libstd/fs.rs | 37 ++++++++++++++++++++----------------- 2 files changed, 23 insertions(+), 21 deletions(-) diff --git a/src/fuzzer/fuzzer.rs b/src/fuzzer/fuzzer.rs index e3f452f687040..9790ec02ff45b 100644 --- a/src/fuzzer/fuzzer.rs +++ b/src/fuzzer/fuzzer.rs @@ -283,10 +283,9 @@ fn check_variants_T( } } -fn last_part(filename: str) -> str unsafe { - let ix = str::rindex_byte(filename, 47u8 /* '/' */); - assert ix >= 0; - str::unsafe::slice_bytes(filename, ix as uint + 1u, str::byte_len(filename) - 3u) +fn last_part(filename: str) -> str { + let ix = option::get(str::rindex(filename, '/')); + str::slice(filename, ix + 1u, str::char_len(filename) - 3u) } enum happiness { passed, cleanly_rejected(str), known_bug(str), failed(str), } diff --git a/src/libstd/fs.rs b/src/libstd/fs.rs index 874d92cc4b9d4..de4789fdd3996 100644 --- a/src/libstd/fs.rs +++ b/src/libstd/fs.rs @@ -32,6 +32,22 @@ A path or fragment of a filesystem path */ type path = str; +fn splitDirnameBasename (pp: path) -> {dirname: str, basename: str} { + let ii; + alt str::rindex(pp, os_fs::path_sep) { + option::some(xx) { ii = xx; } + option::none { + alt str::rindex(pp, os_fs::alt_path_sep) { + option::some(xx) { ii = xx; } + option::none { ret {dirname: ".", basename: pp}; } + } + } + } + + ret {dirname: str::slice(pp, 0u, ii), + basename: str::slice(pp, ii + 1u, str::char_len(pp))}; +} + /* Function: dirname @@ -43,13 +59,8 @@ The dirname of "/usr/share" will be "/usr", but the dirname of If the path is not prefixed with a directory, then "." is returned. 
*/ -fn dirname(p: path) -> path unsafe { - let i: int = str::rindex_byte(p, os_fs::path_sep as u8); - if i == -1 { - i = str::rindex_byte(p, os_fs::alt_path_sep as u8); - if i == -1 { ret "."; } - } - ret str::unsafe::slice_bytes(p, 0u, i as uint); +fn dirname(pp: path) -> path { + ret splitDirnameBasename(pp).dirname; } /* @@ -63,18 +74,10 @@ path separators in the path then the returned path is identical to the provided path. If an empty path is provided or the path ends with a path separator then an empty path is returned. */ -fn basename(p: path) -> path unsafe { - let i: int = str::rindex_byte(p, os_fs::path_sep as u8); - if i == -1 { - i = str::rindex_byte(p, os_fs::alt_path_sep as u8); - if i == -1 { ret p; } - } - let len = str::byte_len(p); - if (i + 1) as uint >= len { ret p; } - ret str::unsafe::slice_bytes(p, (i + 1) as uint, len); +fn basename(pp: path) -> path { + ret splitDirnameBasename(pp).basename; } - // FIXME: Need some typestate to avoid bounds check when len(pre) == 0 /* Function: connect From 207bb3d2df92f896145b3f2ef8aa5ca5cea00104 Mon Sep 17 00:00:00 2001 From: Kevin Cantu Date: Sat, 11 Feb 2012 17:04:08 -0800 Subject: [PATCH 6/6] (core::str) removed [r]index_byte --- src/libcore/str.rs | 44 -------------------------------------------- 1 file changed, 44 deletions(-) diff --git a/src/libcore/str.rs b/src/libcore/str.rs index 5a879fb0cf83f..ca7bb819443b6 100644 --- a/src/libcore/str.rs +++ b/src/libcore/str.rs @@ -72,8 +72,6 @@ export // Searching index, rindex, - index_byte, - rindex_byte, find, contains, starts_with, @@ -913,34 +911,6 @@ fn rindex(ss: str, cc: char) -> option<uint> { ret option::none; } -/* -Function: index - -Returns the index of the first matching byte. Returns -1 if -no match is found. - -FIXME: UTF-8 -*/ -fn index_byte(s: str, c: u8) -> int { - let i: int = 0; - for k: u8 in s { if k == c { ret i; } i += 1; } - ret -1; -} - -/* -Function: rindex - -Returns the index of the last matching byte. 
Returns -1 -if no match is found. - -FIXME: UTF-8 -*/ -fn rindex_byte(s: str, c: u8) -> int { - let n: int = byte_len(s) as int; - while n >= 0 { if s[n] == c { ret n; } n -= 1; } - ret n; -} - /* Function: find @@ -1521,20 +1491,6 @@ mod tests { assert (rindex("hello", 'z') == option::none); } - #[test] - fn test_index_byte() { - assert ( index_byte("hello", 'e' as u8) == 1); - assert ( index_byte("hello", 'o' as u8) == 4); - assert ( index_byte("hello", 'z' as u8) == -1); - } - - #[test] - fn test_rindex_byte() { - assert (rindex_byte("hello", 'l' as u8) == 3); - assert (rindex_byte("hello", 'h' as u8) == 0); - assert (rindex_byte("hello", 'z' as u8) == -1); - } - #[test] fn test_pop_char() { let data = "ประเทศไทย中华";