From 1df715bf4b36f2ccbddcc8cecaf5a27748751f3b Mon Sep 17 00:00:00 2001
From: ITSTAKE
Date: Sun, 2 Jun 2019 23:32:08 +0900
Subject: [PATCH] Fixed allergy parsing problem, escaped html characters.

---
 .../kotlin/me/itstake/neisinfo/NeisParser.kt  | 240 +++++++++++++++++-
 1 file changed, 238 insertions(+), 2 deletions(-)

diff --git a/src/main/kotlin/me/itstake/neisinfo/NeisParser.kt b/src/main/kotlin/me/itstake/neisinfo/NeisParser.kt
index 50df2f2..8b237fb 100644
--- a/src/main/kotlin/me/itstake/neisinfo/NeisParser.kt
+++ b/src/main/kotlin/me/itstake/neisinfo/NeisParser.kt
@@ -1,5 +1,14 @@
 package me.itstake.neisinfo
 
+import java.io.StringWriter
+import java.util.HashMap
+import kotlin.collections.ArrayList
+import kotlin.collections.List
+import kotlin.collections.Map
+import kotlin.collections.forEach
+import kotlin.collections.hashMapOf
+import kotlin.collections.set
+
 class NeisParser {
 
     companion object {
@@ -102,9 +111,10 @@ class NeisParser {
             menus.forEach { t ->
                 val allergies = ArrayList()
                 var name = t
-                for(i in 1..18) {
+                for (i in 18 downTo 1) {
                     val index = t.indexOf("$i.")
-                    if (index > -1) allergies.add(MealMenu.AllergyInfo.getByKey(i)); name = name.replace("$i.", "")
+                    if (index > -1) allergies.add(MealMenu.AllergyInfo.getByKey(i)); name =
+                        StringUtils.unescapeHtml3(name).replace("$i.", "")
                 }
                 ret.add(MealMenu(name.trim(), allergies))
             }
@@ -156,5 +166,231 @@ class NeisParser {
             val textarea = data.substring(textareaStart, data.indexOf("</textarea>") + 11)
             return EventInfo(targetGrades = grades, details = textarea)
         }
+
+        /**
+         * Small HTML 3.x entity decoder used to clean up parsed menu names.
+         *
+         * Knows the named entities listed in ESCAPES plus decimal and hexadecimal numeric
+         * character references; everything else is passed through unchanged.
+         */
+        object StringUtils {
+
+            // Each entry is (literal character, entity name without the surrounding '&' and ';').
+            private val ESCAPES = arrayOf(
+                arrayOf("\"", "quot"), // " - double-quote
+                arrayOf("&", "amp"), // & - ampersand
+                arrayOf("<", "lt"), // < - less-than
+                arrayOf(">", "gt"), // > - greater-than
+
+                // Mapping to escape ISO-8859-1 characters to their named HTML 3.x equivalents.
+                arrayOf("\u00A0", "nbsp"), // non-breaking space
+                arrayOf("\u00A1", "iexcl"), // inverted exclamation mark
+                arrayOf("\u00A2", "cent"), // cent sign
+                arrayOf("\u00A3", "pound"), // pound sign
+                arrayOf("\u00A4", "curren"), // currency sign
+                arrayOf("\u00A5", "yen"), // yen sign = yuan sign
+                arrayOf("\u00A6", "brvbar"), // broken bar = broken vertical bar
+                arrayOf("\u00A7", "sect"), // section sign
+                arrayOf("\u00A8", "uml"), // diaeresis = spacing diaeresis
+                arrayOf("\u00A9", "copy"), // © - copyright sign
+                arrayOf("\u00AA", "ordf"), // feminine ordinal indicator
+                arrayOf("\u00AB", "laquo"), // left-pointing double angle quotation mark = left pointing guillemet
+                arrayOf("\u00AC", "not"), // not sign
+                arrayOf("\u00AD", "shy"), // soft hyphen = discretionary hyphen
+                arrayOf("\u00AE", "reg"), // ® - registered trademark sign
+                arrayOf("\u00AF", "macr"), // macron = spacing macron = overline = APL overbar
+                arrayOf("\u00B0", "deg"), // degree sign
+                arrayOf("\u00B1", "plusmn"), // plus-minus sign = plus-or-minus sign
+                arrayOf("\u00B2", "sup2"), // superscript two = superscript digit two = squared
+                arrayOf("\u00B3", "sup3"), // superscript three = superscript digit three = cubed
+                arrayOf("\u00B4", "acute"), // acute accent = spacing acute
+                arrayOf("\u00B5", "micro"), // micro sign
+                arrayOf("\u00B6", "para"), // pilcrow sign = paragraph sign
+                arrayOf("\u00B7", "middot"), // middle dot = Georgian comma = Greek middle dot
+                arrayOf("\u00B8", "cedil"), // cedilla = spacing cedilla
+                arrayOf("\u00B9", "sup1"), // superscript one = superscript digit one
+                arrayOf("\u00BA", "ordm"), // masculine ordinal indicator
+                arrayOf("\u00BB", "raquo"), // right-pointing double angle quotation mark = right pointing guillemet
+                arrayOf("\u00BC", "frac14"), // vulgar fraction one quarter = fraction one quarter
+                arrayOf("\u00BD", "frac12"), // vulgar fraction one half = fraction one half
+                arrayOf("\u00BE", "frac34"), // vulgar fraction three quarters = fraction three quarters
+                arrayOf("\u00BF", "iquest"), // inverted question mark = turned question mark
+                arrayOf("\u00C0", "Agrave"), // À - uppercase A, grave accent
+                arrayOf("\u00C1", "Aacute"), // Á - uppercase A, acute accent
+                arrayOf("\u00C2", "Acirc"), // Â - uppercase A, circumflex accent
+                arrayOf("\u00C3", "Atilde"), // Ã - uppercase A, tilde
+                arrayOf("\u00C4", "Auml"), // Ä - uppercase A, umlaut
+                arrayOf("\u00C5", "Aring"), // Å - uppercase A, ring
+                arrayOf("\u00C6", "AElig"), // Æ - uppercase AE
+                arrayOf("\u00C7", "Ccedil"), // Ç - uppercase C, cedilla
+                arrayOf("\u00C8", "Egrave"), // È - uppercase E, grave accent
+                arrayOf("\u00C9", "Eacute"), // É - uppercase E, acute accent
+                arrayOf("\u00CA", "Ecirc"), // Ê - uppercase E, circumflex accent
+                arrayOf("\u00CB", "Euml"), // Ë - uppercase E, umlaut
+                arrayOf("\u00CC", "Igrave"), // Ì - uppercase I, grave accent
+                arrayOf("\u00CD", "Iacute"), // Í - uppercase I, acute accent
+                arrayOf("\u00CE", "Icirc"), // Î - uppercase I, circumflex accent
+                arrayOf("\u00CF", "Iuml"), // Ï - uppercase I, umlaut
+                arrayOf("\u00D0", "ETH"), // Ð - uppercase Eth, Icelandic
+                arrayOf("\u00D1", "Ntilde"), // Ñ - uppercase N, tilde
+                arrayOf("\u00D2", "Ograve"), // Ò - uppercase O, grave accent
+                arrayOf("\u00D3", "Oacute"), // Ó - uppercase O, acute accent
+                arrayOf("\u00D4", "Ocirc"), // Ô - uppercase O, circumflex accent
+                arrayOf("\u00D5", "Otilde"), // Õ - uppercase O, tilde
+                arrayOf("\u00D6", "Ouml"), // Ö - uppercase O, umlaut
+                arrayOf("\u00D7", "times"), // multiplication sign
+                arrayOf("\u00D8", "Oslash"), // Ø - uppercase O, slash
+                arrayOf("\u00D9", "Ugrave"), // Ù - uppercase U, grave accent
+                arrayOf("\u00DA", "Uacute"), // Ú - uppercase U, acute accent
+                arrayOf("\u00DB", "Ucirc"), // Û - uppercase U, circumflex accent
+                arrayOf("\u00DC", "Uuml"), // Ü - uppercase U, umlaut
+                arrayOf("\u00DD", "Yacute"), // Ý - uppercase Y, acute accent
+                arrayOf("\u00DE", "THORN"), // Þ - uppercase THORN, Icelandic
+                arrayOf("\u00DF", "szlig"), // ß - lowercase sharp s, German
+                arrayOf("\u00E0", "agrave"), // à - lowercase a, grave accent
+                arrayOf("\u00E1", "aacute"), // á - lowercase a, acute accent
+                arrayOf("\u00E2", "acirc"), // â - lowercase a, circumflex accent
+                arrayOf("\u00E3", "atilde"), // ã - lowercase a, tilde
+                arrayOf("\u00E4", "auml"), // ä - lowercase a, umlaut
+                arrayOf("\u00E5", "aring"), // å - lowercase a, ring
+                arrayOf("\u00E6", "aelig"), // æ - lowercase ae
+                arrayOf("\u00E7", "ccedil"), // ç - lowercase c, cedilla
+                arrayOf("\u00E8", "egrave"), // è - lowercase e, grave accent
+                arrayOf("\u00E9", "eacute"), // é - lowercase e, acute accent
+                arrayOf("\u00EA", "ecirc"), // ê - lowercase e, circumflex accent
+                arrayOf("\u00EB", "euml"), // ë - lowercase e, umlaut
+                arrayOf("\u00EC", "igrave"), // ì - lowercase i, grave accent
+                arrayOf("\u00ED", "iacute"), // í - lowercase i, acute accent
+                arrayOf("\u00EE", "icirc"), // î - lowercase i, circumflex accent
+                arrayOf("\u00EF", "iuml"), // ï - lowercase i, umlaut
+                arrayOf("\u00F0", "eth"), // ð - lowercase eth, Icelandic
+                arrayOf("\u00F1", "ntilde"), // ñ - lowercase n, tilde
+                arrayOf("\u00F2", "ograve"), // ò - lowercase o, grave accent
+                arrayOf("\u00F3", "oacute"), // ó - lowercase o, acute accent
+                arrayOf("\u00F4", "ocirc"), // ô - lowercase o, circumflex accent
+                arrayOf("\u00F5", "otilde"), // õ - lowercase o, tilde
+                arrayOf("\u00F6", "ouml"), // ö - lowercase o, umlaut
+                arrayOf("\u00F7", "divide"), // division sign
+                arrayOf("\u00F8", "oslash"), // ø - lowercase o, slash
+                arrayOf("\u00F9", "ugrave"), // ù - lowercase u, grave accent
+                arrayOf("\u00FA", "uacute"), // ú - lowercase u, acute accent
+                arrayOf("\u00FB", "ucirc"), // û - lowercase u, circumflex accent
+                arrayOf("\u00FC", "uuml"), // ü - lowercase u, umlaut
+                arrayOf("\u00FD", "yacute"), // ý - lowercase y, acute accent
+                arrayOf("\u00FE", "thorn"), // þ - lowercase thorn, Icelandic
+                arrayOf("\u00FF", "yuml") // ÿ - lowercase y, umlaut
+            )
+
+            // Shortest and longest reference body (text between '&' and ';') that is scanned for.
+            private const val MIN_ESCAPE = 2
+            private const val MAX_ESCAPE = 6
+
+            // Entity name -> literal character; filled once from ESCAPES in the init block below.
+            private val lookupMap: HashMap<String, String> = HashMap()
+
+            /**
+             * Replaces HTML 3.x character references in [input] with the characters they denote.
+             *
+             * Named references are resolved through the ESCAPES table; numeric references may be
+             * decimal (`&#65;`) or hexadecimal (`&#x41;`). Unknown names, malformed or signed
+             * numbers, and references whose body exceeds MAX_ESCAPE characters are left verbatim.
+             *
+             * @param input text that may contain character references
+             * @return the decoded text, or [input] itself when nothing was replaced
+             */
+            fun unescapeHtml3(input: String): String {
+                var writer: StringWriter? = null
+                val len = input.length
+                var i = 1 // index just past a candidate '&'
+                var st = 0 // start of the input not yet copied to writer
+                while (true) {
+                    // look for '&'
+                    while (i < len && input[i - 1] != '&')
+                        i++
+                    if (i >= len)
+                        break
+
+                    // found '&', look for ';'
+                    var j = i
+                    while (j < len && j < i + MAX_ESCAPE + 1 && input[j] != ';')
+                        j++
+                    if (j == len || j < i + MIN_ESCAPE || j == i + MAX_ESCAPE + 1) {
+                        i++
+                        continue
+                    }
+
+                    // found escape
+                    if (input[i] == '#') {
+                        // numeric escape
+                        var k = i + 1
+                        var radix = 10
+
+                        val firstChar = input[k]
+                        if (firstChar == 'x' || firstChar == 'X') {
+                            k++
+                            radix = 16
+                        }
+
+                        // Integer.parseInt accepts a leading '+' or '-', which is not valid in a
+                        // character reference, so insist that the number starts with a digit.
+                        if (k == j || Character.digit(input[k], radix) < 0) {
+                            i++
+                            continue
+                        }
+
+                        try {
+                            val entityValue = Integer.parseInt(input.substring(k, j), radix)
+
+                            if (writer == null)
+                                writer = StringWriter(input.length)
+                            writer.append(input.substring(st, i - 1))
+
+                            if (entityValue > 0xFFFF) {
+                                // outside the BMP: write the surrogate pair
+                                val chrs = Character.toChars(entityValue)
+                                writer.write(chrs[0].toInt())
+                                writer.write(chrs[1].toInt())
+                            } else {
+                                writer.write(entityValue)
+                            }
+
+                        } catch (ex: NumberFormatException) {
+                            i++
+                            continue
+                        }
+
+                    } else {
+                        // named escape
+                        val value = lookupMap[input.substring(i, j)]
+                        if (value == null) {
+                            i++
+                            continue
+                        }
+
+                        if (writer == null)
+                            writer = StringWriter(input.length)
+                        writer.append(input.substring(st, i - 1))
+
+                        writer.append(value)
+                    }
+
+                    // skip escape
+                    st = j + 1
+                    i = st
+                }
+
+                if (writer != null) {
+                    writer.append(input.substring(st, len))
+                    return writer.toString()
+                }
+                return input
+            }
+
+            init {
+                for (seq in ESCAPES)
+                    lookupMap[seq[1]] = seq[0]
+            }
+
+        }
     }
 }
\ No newline at end of file