Skip to content

Commit

Permalink
Fixed allergy parsing problem, escaped html characters.
Browse files Browse the repository at this point in the history
  • Loading branch information
EATSTEAK committed Jun 2, 2019
1 parent c99a40b commit 1df715b
Showing 1 changed file with 211 additions and 2 deletions.
213 changes: 211 additions & 2 deletions src/main/kotlin/me/itstake/neisinfo/NeisParser.kt
Original file line number Diff line number Diff line change
@@ -1,5 +1,14 @@
package me.itstake.neisinfo

import java.io.StringWriter
import java.util.HashMap
import kotlin.collections.ArrayList
import kotlin.collections.List
import kotlin.collections.Map
import kotlin.collections.forEach
import kotlin.collections.hashMapOf
import kotlin.collections.set


class NeisParser {
companion object {
Expand Down Expand Up @@ -102,9 +111,10 @@ class NeisParser {
menus.forEach { t ->
val allergies = ArrayList<MealMenu.AllergyInfo>()
var name = t
for(i in 1..18) {
for (i in 18 downTo 1) {
val index = t.indexOf("$i.")
if (index > -1) allergies.add(MealMenu.AllergyInfo.getByKey(i)); name = name.replace("$i.", "")
if (index > -1) allergies.add(MealMenu.AllergyInfo.getByKey(i)); name =
StringUtils.unescapeHtml3(name).replace("$i.", "")
}
ret.add(MealMenu(name.trim(), allergies))
}
Expand Down Expand Up @@ -156,5 +166,204 @@ class NeisParser {
val textarea = data.substring(textareaStart, data.indexOf("</textarea>") + 11)
return EventInfo(targetGrades = grades, details = textarea)
}

/**
 * Minimal HTML 3.x entity decoder.
 *
 * Supports the named entities listed in [ESCAPES] (the four markup characters plus the
 * ISO-8859-1 range U+00A0..U+00FF) and numeric character references in decimal (`&#65;`)
 * or hexadecimal (`&#x41;`) form.
 */
object StringUtils {

    /** `(literal character, entity name)` pairs; [lookupMap] is derived from this table. */
    private val ESCAPES = arrayOf(
        arrayOf("\"", "quot"), // " - double-quote
        arrayOf("&", "amp"), // & - ampersand
        arrayOf("<", "lt"), // < - less-than
        arrayOf(">", "gt"), // > - greater-than

        // Mapping of ISO-8859-1 characters to their named HTML 3.x equivalents.
        arrayOf("\u00A0", "nbsp"), // non-breaking space
        arrayOf("\u00A1", "iexcl"), // inverted exclamation mark
        arrayOf("\u00A2", "cent"), // cent sign
        arrayOf("\u00A3", "pound"), // pound sign
        arrayOf("\u00A4", "curren"), // currency sign
        arrayOf("\u00A5", "yen"), // yen sign = yuan sign
        arrayOf("\u00A6", "brvbar"), // broken bar = broken vertical bar
        arrayOf("\u00A7", "sect"), // section sign
        arrayOf("\u00A8", "uml"), // diaeresis = spacing diaeresis
        arrayOf("\u00A9", "copy"), // © - copyright sign
        arrayOf("\u00AA", "ordf"), // feminine ordinal indicator
        arrayOf("\u00AB", "laquo"), // left-pointing double angle quotation mark = left pointing guillemet
        arrayOf("\u00AC", "not"), // not sign
        arrayOf("\u00AD", "shy"), // soft hyphen = discretionary hyphen
        arrayOf("\u00AE", "reg"), // ® - registered trademark sign
        arrayOf("\u00AF", "macr"), // macron = spacing macron = overline = APL overbar
        arrayOf("\u00B0", "deg"), // degree sign
        arrayOf("\u00B1", "plusmn"), // plus-minus sign = plus-or-minus sign
        arrayOf("\u00B2", "sup2"), // superscript two = superscript digit two = squared
        arrayOf("\u00B3", "sup3"), // superscript three = superscript digit three = cubed
        arrayOf("\u00B4", "acute"), // acute accent = spacing acute
        arrayOf("\u00B5", "micro"), // micro sign
        arrayOf("\u00B6", "para"), // pilcrow sign = paragraph sign
        arrayOf("\u00B7", "middot"), // middle dot = Georgian comma = Greek middle dot
        arrayOf("\u00B8", "cedil"), // cedilla = spacing cedilla
        arrayOf("\u00B9", "sup1"), // superscript one = superscript digit one
        arrayOf("\u00BA", "ordm"), // masculine ordinal indicator
        arrayOf("\u00BB", "raquo"), // right-pointing double angle quotation mark = right pointing guillemet
        arrayOf("\u00BC", "frac14"), // vulgar fraction one quarter = fraction one quarter
        arrayOf("\u00BD", "frac12"), // vulgar fraction one half = fraction one half
        arrayOf("\u00BE", "frac34"), // vulgar fraction three quarters = fraction three quarters
        arrayOf("\u00BF", "iquest"), // inverted question mark = turned question mark
        arrayOf("\u00C0", "Agrave"), // À - uppercase A, grave accent
        arrayOf("\u00C1", "Aacute"), // Á - uppercase A, acute accent
        arrayOf("\u00C2", "Acirc"), // Â - uppercase A, circumflex accent
        arrayOf("\u00C3", "Atilde"), // Ã - uppercase A, tilde
        arrayOf("\u00C4", "Auml"), // Ä - uppercase A, umlaut
        arrayOf("\u00C5", "Aring"), // Å - uppercase A, ring
        arrayOf("\u00C6", "AElig"), // Æ - uppercase AE
        arrayOf("\u00C7", "Ccedil"), // Ç - uppercase C, cedilla
        arrayOf("\u00C8", "Egrave"), // È - uppercase E, grave accent
        arrayOf("\u00C9", "Eacute"), // É - uppercase E, acute accent
        arrayOf("\u00CA", "Ecirc"), // Ê - uppercase E, circumflex accent
        arrayOf("\u00CB", "Euml"), // Ë - uppercase E, umlaut
        arrayOf("\u00CC", "Igrave"), // Ì - uppercase I, grave accent
        arrayOf("\u00CD", "Iacute"), // Í - uppercase I, acute accent
        arrayOf("\u00CE", "Icirc"), // Î - uppercase I, circumflex accent
        arrayOf("\u00CF", "Iuml"), // Ï - uppercase I, umlaut
        arrayOf("\u00D0", "ETH"), // Ð - uppercase Eth, Icelandic
        arrayOf("\u00D1", "Ntilde"), // Ñ - uppercase N, tilde
        arrayOf("\u00D2", "Ograve"), // Ò - uppercase O, grave accent
        arrayOf("\u00D3", "Oacute"), // Ó - uppercase O, acute accent
        arrayOf("\u00D4", "Ocirc"), // Ô - uppercase O, circumflex accent
        arrayOf("\u00D5", "Otilde"), // Õ - uppercase O, tilde
        arrayOf("\u00D6", "Ouml"), // Ö - uppercase O, umlaut
        arrayOf("\u00D7", "times"), // multiplication sign
        arrayOf("\u00D8", "Oslash"), // Ø - uppercase O, slash
        arrayOf("\u00D9", "Ugrave"), // Ù - uppercase U, grave accent
        arrayOf("\u00DA", "Uacute"), // Ú - uppercase U, acute accent
        arrayOf("\u00DB", "Ucirc"), // Û - uppercase U, circumflex accent
        arrayOf("\u00DC", "Uuml"), // Ü - uppercase U, umlaut
        arrayOf("\u00DD", "Yacute"), // Ý - uppercase Y, acute accent
        arrayOf("\u00DE", "THORN"), // Þ - uppercase THORN, Icelandic
        arrayOf("\u00DF", "szlig"), // ß - lowercase sharp s, German
        arrayOf("\u00E0", "agrave"), // à - lowercase a, grave accent
        arrayOf("\u00E1", "aacute"), // á - lowercase a, acute accent
        arrayOf("\u00E2", "acirc"), // â - lowercase a, circumflex accent
        arrayOf("\u00E3", "atilde"), // ã - lowercase a, tilde
        arrayOf("\u00E4", "auml"), // ä - lowercase a, umlaut
        arrayOf("\u00E5", "aring"), // å - lowercase a, ring
        arrayOf("\u00E6", "aelig"), // æ - lowercase ae
        arrayOf("\u00E7", "ccedil"), // ç - lowercase c, cedilla
        arrayOf("\u00E8", "egrave"), // è - lowercase e, grave accent
        arrayOf("\u00E9", "eacute"), // é - lowercase e, acute accent
        arrayOf("\u00EA", "ecirc"), // ê - lowercase e, circumflex accent
        arrayOf("\u00EB", "euml"), // ë - lowercase e, umlaut
        arrayOf("\u00EC", "igrave"), // ì - lowercase i, grave accent
        arrayOf("\u00ED", "iacute"), // í - lowercase i, acute accent
        arrayOf("\u00EE", "icirc"), // î - lowercase i, circumflex accent
        arrayOf("\u00EF", "iuml"), // ï - lowercase i, umlaut
        arrayOf("\u00F0", "eth"), // ð - lowercase eth, Icelandic
        arrayOf("\u00F1", "ntilde"), // ñ - lowercase n, tilde
        arrayOf("\u00F2", "ograve"), // ò - lowercase o, grave accent
        arrayOf("\u00F3", "oacute"), // ó - lowercase o, acute accent
        arrayOf("\u00F4", "ocirc"), // ô - lowercase o, circumflex accent
        arrayOf("\u00F5", "otilde"), // õ - lowercase o, tilde
        arrayOf("\u00F6", "ouml"), // ö - lowercase o, umlaut
        arrayOf("\u00F7", "divide"), // division sign
        arrayOf("\u00F8", "oslash"), // ø - lowercase o, slash
        arrayOf("\u00F9", "ugrave"), // ù - lowercase u, grave accent
        arrayOf("\u00FA", "uacute"), // ú - lowercase u, acute accent
        arrayOf("\u00FB", "ucirc"), // û - lowercase u, circumflex accent
        arrayOf("\u00FC", "uuml"), // ü - lowercase u, umlaut
        arrayOf("\u00FD", "yacute"), // ý - lowercase y, acute accent
        arrayOf("\u00FE", "thorn"), // þ - lowercase thorn, Icelandic
        arrayOf("\u00FF", "yuml") // ÿ - lowercase y, umlaut
    )

    /** Shortest entity body (text between '&' and ';') that is considered, e.g. "lt" or "#9". */
    private const val MIN_ESCAPE = 2

    /**
     * Longest entity body that is considered. Eight characters is enough for the largest
     * numeric references ("#x10FFFF", "#1114111"); every name in [ESCAPES] is shorter.
     */
    private const val MAX_ESCAPE = 8

    /** Entity name -> replacement text, built once from [ESCAPES]. */
    private val lookupMap: Map<String, String> =
        ESCAPES.associate { (literal, entityName) -> entityName to literal }

    /**
     * Replaces every recognised HTML entity in [input] with the character it denotes.
     *
     * Anything that is not a recognised entity (unknown name, malformed or out-of-range
     * number, missing ';') is copied through unchanged.
     *
     * @param input text that may contain HTML entities
     * @return the decoded text, or [input] itself when nothing was replaced
     */
    fun unescapeHtml3(input: String): String {
        var out: StringBuilder? = null // allocated lazily, only once something is decoded
        var copied = 0 // input[0 until copied] has already been emitted to `out`
        var searchFrom = 0

        while (true) {
            val amp = input.indexOf('&', searchFrom)
            if (amp < 0) break
            val bodyStart = amp + 1
            // On any failure below, resume scanning right after this '&'.
            searchFrom = bodyStart

            // Look for the terminating ';' within a bounded window so that a stray '&'
            // never triggers a scan of the whole remaining input.
            val limit = minOf(input.length, bodyStart + MAX_ESCAPE + 1)
            var semi = bodyStart
            while (semi < limit && input[semi] != ';') semi++
            if (semi == limit || semi - bodyStart < MIN_ESCAPE) continue

            val decoded = decodeEntity(input.substring(bodyStart, semi)) ?: continue

            val target = out ?: StringBuilder(input.length)
            out = target
            target.append(input, copied, amp).append(decoded)
            copied = semi + 1
            searchFrom = copied
        }

        return out?.append(input, copied, input.length)?.toString() ?: input
    }

    /**
     * Decodes a single entity body (the text between '&' and ';').
     *
     * @param body entity body of at least [MIN_ESCAPE] characters
     * @return the replacement text, or `null` when [body] is not a recognised entity
     */
    private fun decodeEntity(body: String): String? {
        if (body[0] != '#') return lookupMap[body]

        // body.length >= MIN_ESCAPE (2), so index 1 always exists.
        val isHex = body[1] == 'x' || body[1] == 'X'
        val radix = if (isHex) 16 else 10
        val digits = body.substring(if (isHex) 2 else 1)

        // Digits only: a sign such as "#-5" or "#+65" is not a character reference and
        // would otherwise be parsed and written out as a bogus character.
        if (digits.isEmpty() || !digits.all { Character.digit(it, radix) >= 0 }) return null

        val codePoint = digits.toIntOrNull(radix) ?: return null
        // Character.toChars throws for values above U+10FFFF, so reject them here.
        if (!Character.isValidCodePoint(codePoint)) return null
        return String(Character.toChars(codePoint))
    }
}
}
}

0 comments on commit 1df715b

Please sign in to comment.