|
| 1 | +package cc.unitmesh.tools.web |
| 2 | + |
| 3 | +import cc.unitmesh.agent.Tool |
| 4 | +import org.jsoup.Jsoup |
| 5 | +import java.net.URLEncoder |
| 6 | +import java.nio.charset.StandardCharsets |
| 7 | + |
| 8 | + |
/**
 * Tool that turns a title into one or more English Wikipedia search URLs.
 *
 * The title is submitted to the `index.php?search=` endpoint and the returned HTML is inspected:
 * - if the page contains `div.mw-search-result-heading` elements (presumably a search-results
 *   listing), one URL is produced per result heading, with parenthesised qualifiers stripped;
 * - if any paragraph or list contains the text "may refer to:" (presumably a disambiguation
 *   page), the lookup is retried with the title wrapped in square brackets;
 * - otherwise the requested URL itself is produced.
 *
 * Each instance keeps every URL it has produced in [urlList].
 */
@Tool(name = "wikimedia", value = ["wikimedia"])
class Wikimedia {
    /** HTTP headers sent with every request; supplies a browser-like `User-Agent`. */
    val headers = mapOf(
        "User-Agent" to "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) " +
            "Chrome/117.0.0.0 Safari/537.36 Edg/113.0.1774.35"
    )

    /** Every URL collected by [run] on this instance so far; it accumulates across calls. */
    val urlList = mutableListOf<String>()

    /**
     * Looks up [title] and appends the resulting URL(s) to [urlList].
     *
     * Performs blocking network I/O; exceptions thrown by Jsoup while fetching the page
     * propagate to the caller.
     *
     * @param title the article title or search phrase to look up.
     * @return [urlList] itself, i.e. all URLs collected by this instance including the new ones.
     */
    fun run(title: String): MutableList<String> {
        // Results are gathered in a separate list first. The previous implementation recursed
        // through run() and then appended the shared list to itself, duplicating every entry.
        urlList.addAll(resolve(title, depth = 0))
        return urlList
    }

    /**
     * Fetches the search page for [title] and returns the URLs derived from it without
     * touching [urlList].
     *
     * @param depth how many bracketed retries have already been made for this lookup.
     */
    private fun resolve(title: String, depth: Int): List<String> {
        val pageUrl = url(title)
        val doc = Jsoup.connect(pageUrl).headers(headers).get()

        val headings = doc.select("div.mw-search-result-heading")
        if (headings.isNotEmpty()) {
            return headings.map { heading ->
                // Trim again after stripping: removing a trailing "(...)" leaves whitespace behind.
                url(removeNestedParentheses(heading.text().trim()).trim())
            }
        }

        val looksAmbiguous = doc.select("p, ul").any { "may refer to:" in it.text() }
        return if (looksAmbiguous && depth < MAX_DISAMBIGUATION_DEPTH) {
            resolve("[$title]", depth + 1)
        } else {
            // Either a regular page, or the retry budget is exhausted: fall back to this URL
            // rather than recursing without bound.
            listOf(pageUrl)
        }
    }

    /**
     * Removes every parenthesised group from [string], innermost first, so nested groups such
     * as `a (b (c))` are removed completely. Empty `()` pairs are left as they are.
     */
    private fun removeNestedParentheses(string: String): String {
        var result = string
        while (INNERMOST_PARENTHESES.containsMatchIn(result)) {
            result = INNERMOST_PARENTHESES.replace(result, "")
        }
        return result
    }

    /** Builds the English Wikipedia search URL for [it], URL-encoding the query as UTF-8. */
    private fun url(it: String) =
        "https://en.wikipedia.org/w/index.php?search=${URLEncoder.encode(it, StandardCharsets.UTF_8)}"

    private companion object {
        /** Upper bound on bracketed retries triggered by "may refer to:" pages. */
        const val MAX_DISAMBIGUATION_DEPTH = 3

        /** Matches a parenthesised group that contains no further parentheses. */
        val INNERMOST_PARENTHESES = "\\([^()]+\\)".toRegex()
    }
}
0 commit comments