From fc06146aa4231a11c5c0f1976a800992566e6e7e Mon Sep 17 00:00:00 2001 From: Alex Ehlke Date: Thu, 25 Jul 2024 15:24:08 -0400 Subject: [PATCH 1/3] optimizations --- Sources/CharacterReader.swift | 69 +++++++++++++++++++---------------- Sources/Entities.swift | 3 +- Sources/TokeniserState.swift | 29 +++++++++------ 3 files changed, 57 insertions(+), 44 deletions(-) diff --git a/Sources/CharacterReader.swift b/Sources/CharacterReader.swift index d53c7950..becac185 100644 --- a/Sources/CharacterReader.swift +++ b/Sources/CharacterReader.swift @@ -43,18 +43,18 @@ public final class CharacterReader { return CharacterReader.EOF } let val = input[pos] - pos = input.index(after: pos) + input.formIndex(after: &pos) return val } public func unconsume() { guard pos > input.startIndex else { return } - pos = input.index(before: pos) + input.formIndex(before: &pos) } public func advance() { guard pos < input.endIndex else { return } - pos = input.index(after: pos) + input.formIndex(after: &pos) } public func markPos() { @@ -68,7 +68,7 @@ public final class CharacterReader { public func consumeAsString() -> String { guard pos < input.endIndex else { return "" } let str = String(input[pos]) - pos = input.index(after: pos) + input.formIndex(after: &pos) return str } @@ -100,7 +100,7 @@ public final class CharacterReader { var current = firstCharIx // Then manually match subsequent scalars for scalar in targetScalars.dropFirst() { - current = input.index(after: current) + input.formIndex(after: &current) guard current < input.endIndex else { return nil } if input[current] != scalar { start = input.index(after: firstCharIx) @@ -130,36 +130,37 @@ public final class CharacterReader { return consumed } - public func consumeToAny(_ chars: UnicodeScalar...) -> String { - return consumeToAny(chars) - } +// public func consumeToAny(_ chars: UnicodeScalar...) 
-> String { +// return consumeToAny(Set(chars)) +// } - public func consumeToAny(_ chars: [UnicodeScalar]) -> String { + public func consumeToAny(_ chars: Set) -> String { + let endIndex = input.endIndex let start = pos - while pos < input.endIndex { + while pos < endIndex { if chars.contains(input[pos]) { break } - pos = input.index(after: pos) + input.formIndex(after: &pos) } return cacheString(start, pos) } - public func consumeToAnySorted(_ chars: UnicodeScalar...) -> String { - return consumeToAny(chars) - } +// public func consumeToAnySorted(_ chars: UnicodeScalar...) -> String { +// return consumeToAny(chars) +// } - public func consumeToAnySorted(_ chars: [UnicodeScalar]) -> String { + public func consumeToAnySorted(_ chars: Set) -> String { return consumeToAny(chars) } - static let dataTerminators: [UnicodeScalar] = [.Ampersand, .LessThan, TokeniserStateVars.nullScalr] + static let dataTerminators: Set = Set([.Ampersand, .LessThan, TokeniserStateVars.nullScalr]) // read to &, <, or null public func consumeData() -> String { return consumeToAny(CharacterReader.dataTerminators) } - static let tagNameTerminators: [UnicodeScalar] = [.BackslashT, .BackslashN, .BackslashR, .BackslashF, .Space, .Slash, .GreaterThan, TokeniserStateVars.nullScalr] + static let tagNameTerminators: Set = Set([.BackslashT, .BackslashN, .BackslashR, .BackslashF, .Space, .Slash, .GreaterThan, TokeniserStateVars.nullScalr]) // read to '\t', '\n', '\r', '\f', ' ', '/', '>', or nullChar public func consumeTagName() -> String { return consumeToAny(CharacterReader.tagNameTerminators) @@ -173,10 +174,11 @@ public final class CharacterReader { public func consumeLetterSequence() -> String { let start = pos - while pos < input.endIndex { + let endIndex = input.endIndex + while pos < endIndex { let c = input[pos] if ((c >= "A" && c <= "Z") || (c >= "a" && c <= "z") || c.isMemberOfCharacterSet(CharacterSet.letters)) { - pos = input.index(after: pos) + input.formIndex(after: &pos) } else { break } 
@@ -186,18 +188,19 @@ public final class CharacterReader { public func consumeLetterThenDigitSequence() -> String { let start = pos - while pos < input.endIndex { + let endIndex = input.endIndex + while pos < endIndex { let c = input[pos] if ((c >= "A" && c <= "Z") || (c >= "a" && c <= "z") || c.isMemberOfCharacterSet(CharacterSet.letters)) { - pos = input.index(after: pos) + input.formIndex(after: &pos) } else { break } } - while pos < input.endIndex { + while pos < endIndex { let c = input[pos] if (c >= "0" && c <= "9") { - pos = input.index(after: pos) + input.formIndex(after: &pos) } else { break } @@ -207,10 +210,11 @@ public final class CharacterReader { public func consumeHexSequence() -> String { let start = pos - while pos < input.endIndex { + let endIndex = input.endIndex + while pos < endIndex { let c = input[pos] if ((c >= "0" && c <= "9") || (c >= "A" && c <= "F") || (c >= "a" && c <= "f")) { - pos = input.index(after: pos) + input.formIndex(after: &pos) } else { break } @@ -220,10 +224,11 @@ public final class CharacterReader { public func consumeDigitSequence() -> String { let start = pos - while pos < input.endIndex { + let endIndex = input.endIndex + while pos < endIndex { let c = input[pos] if (c >= "0" && c <= "9") { - pos = input.index(after: pos) + input.formIndex(after: &pos) } else { break } @@ -239,14 +244,16 @@ public final class CharacterReader { public func matches(_ seq: String, ignoreCase: Bool = false, consume: Bool = false) -> Bool { var current = pos let scalars = seq.unicodeScalars + let endIndex = input.endIndex for scalar in scalars { - guard current < input.endIndex else { return false } + guard current < endIndex else { return false } + let c = input[current] if ignoreCase { - guard input[current].uppercase == scalar.uppercase else { return false } + guard c.uppercase == scalar.uppercase else { return false } } else { - guard input[current] == scalar else { return false } + guard c == scalar else { return false } } - current = 
input.index(after: current) + input.formIndex(after: &current) } if consume { pos = current diff --git a/Sources/Entities.swift b/Sources/Entities.swift index b513301c..93b7ea54 100644 --- a/Sources/Entities.swift +++ b/Sources/Entities.swift @@ -51,7 +51,7 @@ public class Entities { return left.value != right.value } - private static let codeDelims: [UnicodeScalar] = [",", ";"] + private static let codeDelims: Set = Set([",", ";"]) init(string: String, size: Int, id: Int) { @@ -103,6 +103,7 @@ public class Entities { while ix < entitiesByCodepoint.endIndex && entitiesByCodepoint[ix].scalar == codepoint { matches.append(entitiesByCodepoint[ix].name) ix = entitiesByCodepoint.index(after: ix) + entitiesByCodepoint.formIndex(after: &ix) } return matches.isEmpty ? nil : matches.sorted().last! } diff --git a/Sources/TokeniserState.swift b/Sources/TokeniserState.swift index e55d79b0..1a23b674 100644 --- a/Sources/TokeniserState.swift +++ b/Sources/TokeniserState.swift @@ -15,10 +15,15 @@ protocol TokeniserStateProtocol { public class TokeniserStateVars { public static let nullScalr: UnicodeScalar = "\u{0000}" - static let attributeSingleValueCharsSorted = ["'", UnicodeScalar.Ampersand, nullScalr].sorted() - static let attributeDoubleValueCharsSorted = ["\"", UnicodeScalar.Ampersand, nullScalr].sorted() - static let attributeNameCharsSorted = [UnicodeScalar.BackslashT, "\n", "\r", UnicodeScalar.BackslashF, " ", "/", "=", ">", nullScalr, "\"", "'", UnicodeScalar.LessThan].sorted() - static let attributeValueUnquoted = [UnicodeScalar.BackslashT, "\n", "\r", UnicodeScalar.BackslashF, " ", UnicodeScalar.Ampersand, ">", nullScalr, "\"", "'", UnicodeScalar.LessThan, "=", "`"].sorted() + static let attributeSingleValueChars = Set(["'", UnicodeScalar.Ampersand, nullScalr]) + static let attributeDoubleValueChars = Set(["\"", UnicodeScalar.Ampersand, nullScalr]) + static let attributeNameChars = Set([UnicodeScalar.BackslashT, "\n", "\r", UnicodeScalar.BackslashF, " ", "/", "=", ">", 
nullScalr, "\"", "'", UnicodeScalar.LessThan]) + static let attributeValueUnquoted = Set([UnicodeScalar.BackslashT, "\n", "\r", UnicodeScalar.BackslashF, " ", UnicodeScalar.Ampersand, ">", nullScalr, "\"", "'", UnicodeScalar.LessThan, "=", "`"]) + + static let dataDefaultStopChars: Set = [UnicodeScalar.Ampersand, UnicodeScalar.LessThan, TokeniserStateVars.nullScalr] + static let commentDefaultStopChars: Set = ["-", TokeniserStateVars.nullScalr] + static let readDataDefaultStopChars: Set = [UnicodeScalar.LessThan, TokeniserStateVars.nullScalr] + static let replacementChar: UnicodeScalar = Tokeniser.replacementChar static let replacementStr: String = String(Tokeniser.replacementChar) @@ -137,7 +142,7 @@ enum TokeniserState: TokeniserStateProtocol { try t.emit(Token.EOF()) break default: - let data = r.consumeToAny(UnicodeScalar.Ampersand, UnicodeScalar.LessThan, TokeniserStateVars.nullScalr) + let data = r.consumeToAny(TokeniserStateVars.dataDefaultStopChars) t.emit(data) break } @@ -417,7 +422,7 @@ enum TokeniserState: TokeniserStateProtocol { t.emit(TokeniserStateVars.replacementChar) break default: - let data = r.consumeToAny("-", UnicodeScalar.LessThan, TokeniserStateVars.nullScalr) + let data = r.consumeToAny(TokeniserStateVars.dataDefaultStopChars) t.emit(data) } break @@ -528,7 +533,7 @@ enum TokeniserState: TokeniserStateProtocol { t.transition(.Data) break default: - let data = r.consumeToAny("-", UnicodeScalar.LessThan, TokeniserStateVars.nullScalr) + let data = r.consumeToAny(TokeniserStateVars.dataDefaultStopChars) t.emit(data) } break @@ -633,7 +638,7 @@ enum TokeniserState: TokeniserStateProtocol { } break case .AttributeName: - let name = r.consumeToAnySorted(TokeniserStateVars.attributeNameCharsSorted) + let name = r.consumeToAnySorted(TokeniserStateVars.attributeNameChars) t.tagPending.appendAttributeName(name) let c = r.consume() @@ -764,7 +769,7 @@ enum TokeniserState: TokeniserStateProtocol { } break case .AttributeValue_doubleQuoted: - let value 
= r.consumeToAny(TokeniserStateVars.attributeDoubleValueCharsSorted) + let value = r.consumeToAny(TokeniserStateVars.attributeDoubleValueChars) if (value.count > 0) { t.tagPending.appendAttributeValue(value) } else { @@ -798,7 +803,7 @@ enum TokeniserState: TokeniserStateProtocol { } break case .AttributeValue_singleQuoted: - let value = r.consumeToAny(TokeniserStateVars.attributeSingleValueCharsSorted) + let value = r.consumeToAny(TokeniserStateVars.attributeSingleValueChars) if (value.count > 0) { t.tagPending.appendAttributeValue(value) } else { @@ -1008,7 +1013,7 @@ enum TokeniserState: TokeniserStateProtocol { t.transition(.Data) break default: - t.commentPending.data.append(r.consumeToAny("-", TokeniserStateVars.nullScalr)) + t.commentPending.data.append(r.consumeToAny(TokeniserStateVars.commentDefaultStopChars)) } break case .CommentEndDash: @@ -1592,7 +1597,7 @@ enum TokeniserState: TokeniserStateProtocol { try t.emit(Token.EOF()) break default: - let data = r.consumeToAny(UnicodeScalar.LessThan, TokeniserStateVars.nullScalr) + let data = r.consumeToAny(TokeniserStateVars.readDataDefaultStopChars) t.emit(data) break } From 4ff5709f5a1bb9952a30029eae5a879aec7bd31d Mon Sep 17 00:00:00 2001 From: Alex Ehlke Date: Thu, 25 Jul 2024 15:26:23 -0400 Subject: [PATCH 2/3] wip --- Sources/Entities.swift | 1 - 1 file changed, 1 deletion(-) diff --git a/Sources/Entities.swift b/Sources/Entities.swift index 93b7ea54..8e91531d 100644 --- a/Sources/Entities.swift +++ b/Sources/Entities.swift @@ -102,7 +102,6 @@ public class Entities { var matches: [String] = [] while ix < entitiesByCodepoint.endIndex && entitiesByCodepoint[ix].scalar == codepoint { matches.append(entitiesByCodepoint[ix].name) - ix = entitiesByCodepoint.index(after: ix) entitiesByCodepoint.formIndex(after: &ix) } return matches.isEmpty ? nil : matches.sorted().last! 
From 86f4189748118d11ae4108c0134af8421b4689ab Mon Sep 17 00:00:00 2001 From: Alex Ehlke Date: Thu, 25 Jul 2024 20:22:01 -0400 Subject: [PATCH 3/3] wip --- Sources/CharacterReader.swift | 36 ++++++++++++++++++++++------------- Sources/TokeniserState.swift | 2 +- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/Sources/CharacterReader.swift b/Sources/CharacterReader.swift index becac185..91a90f84 100644 --- a/Sources/CharacterReader.swift +++ b/Sources/CharacterReader.swift @@ -65,12 +65,12 @@ public final class CharacterReader { pos = mark } - public func consumeAsString() -> String { - guard pos < input.endIndex else { return "" } - let str = String(input[pos]) - input.formIndex(after: &pos) - return str - } +// public func consumeAsString() -> String { +// guard pos < input.endIndex else { return "" } +// let str = String(input[pos]) +// input.formIndex(after: &pos) +// return str +// } /** * Locate the next occurrence of a Unicode scalar @@ -132,20 +132,30 @@ public final class CharacterReader { // public func consumeToAny(_ chars: UnicodeScalar...) -> String { // return consumeToAny(Set(chars)) +// } + +// public func consumeToAny(_ chars: Set) -> String { +// let endIndex = input.endIndex +// let start = pos +// while pos < endIndex { +// if chars.contains(input[pos]) { +// break +// } +// input.formIndex(after: &pos) +// } +// return cacheString(start, pos) // } public func consumeToAny(_ chars: Set) -> String { - let endIndex = input.endIndex let start = pos - while pos < endIndex { - if chars.contains(input[pos]) { - break - } - input.formIndex(after: &pos) + if let nextIndex = input[pos...].firstIndex(where: { chars.contains($0) }) { + pos = nextIndex + } else { + pos = input.endIndex } return cacheString(start, pos) } - + // public func consumeToAnySorted(_ chars: UnicodeScalar...) 
-> String { // return consumeToAny(chars) // } diff --git a/Sources/TokeniserState.swift b/Sources/TokeniserState.swift index 1a23b674..d58a6513 100644 --- a/Sources/TokeniserState.swift +++ b/Sources/TokeniserState.swift @@ -99,7 +99,7 @@ enum TokeniserState: TokeniserStateProtocol { case BogusDoctype case CdataSection - internal func read(_ t: Tokeniser, _ r: CharacterReader)throws { + internal func read(_ t: Tokeniser, _ r: CharacterReader) throws { switch self { case .Data: switch (r.current()) {