From 8bd67b70cb86f571c6b166804c06c11d6f64f5a5 Mon Sep 17 00:00:00 2001 From: Stefano Volpe Date: Mon, 17 Jun 2024 18:17:37 +0200 Subject: [PATCH 01/11] chore: refactor --- .github/dependabot.yml | 6 ++ .gitmodules | 3 + config | 1 + src/degrees.rs | 188 ++++++++++++++++++++++++++++++++++++++ src/main.rs | 198 ++--------------------------------------- src/teachings.rs | 1 + 6 files changed, 206 insertions(+), 191 deletions(-) create mode 100644 .gitmodules create mode 160000 config create mode 100644 src/degrees.rs create mode 100644 src/teachings.rs diff --git a/.github/dependabot.yml b/.github/dependabot.yml index e8d486a..ca6709c 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -9,3 +9,9 @@ updates: directory: "/" # Location of package manifests schedule: interval: "weekly" + - package-ecosystem: "gitsubmodule" + directory: "/" + allow: + - dependency-name: "json/config" + schedule: + interval: "daily" diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..37dee21 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "config"] + path = config + url = git@github.com:csunibo/config diff --git a/config b/config new file mode 160000 index 0000000..f3a84ec --- /dev/null +++ b/config @@ -0,0 +1 @@ +Subproject commit f3a84ec3afeaaa18d620384ff17cc58a665e9ead diff --git a/src/degrees.rs b/src/degrees.rs new file mode 100644 index 0000000..4822b4b --- /dev/null +++ b/src/degrees.rs @@ -0,0 +1,188 @@ +use std::{collections::HashMap, fmt::Write, path::Path}; + +use eyre::{eyre, Result}; +use itertools::Itertools; +use lazy_static::lazy_static; +use scraper::Selector; +use substring::Substring; + +lazy_static! { + static ref TABLE: Selector = scraper::Selector::parse("td.title").unwrap(); + static ref TITLE: Selector = scraper::Selector::parse("div#u-content-intro>h1").unwrap(); + static ref LANG: Selector = scraper::Selector::parse("li.language-en").unwrap(); + static ref DESC: Selector = scraper::Selector::parse("div.description-text").unwrap(); + static ref DESC_END_MARKER: HashMap = [ + ("Numerical Computing".to_string(), "Teaching".to_string()), + ("History of Informatics".to_string(), "Office".to_string()), + ("*".to_string(), "Readings".to_string()) + ] + .into(); + static ref MISSING_TRANSLATIONS: HashMap = [ + ("BASI DI DATI".to_string(), "DATABASES".to_string()), + ( + "INTRODUZIONE ALL'APPRENDIMENTO AUTOMATICO".to_string(), + "Introduction to machine learning".to_string() + ), + ("FONDAMENTI DI".to_string(), "".to_string()), + ( + "Learning outcomes".to_string(), + "=== Learning outcomes".to_string() + ), + ( + "Degree contents".to_string(), + "=== Degree contents".to_string() + ) + ] + .into(); +} + +pub struct Degree { + pub name: &'static str, + pub slug: &'static str, + pub url: &'static str, +} + +pub const DEGREES: &[Degree] = &[ + Degree { + name: "Informatica", + slug: "informatica", + url: "https://corsi.unibo.it/laurea/informatica/insegnamenti/piano/2022/8009/000/000/2022" + }, + Degree { + name: "Ingegneria Informatica", + slug: "ing-informatica", + url: "https://corsi.unibo.it/laurea/IngegneriaInformatica/insegnamenti/piano/2021/9254/000/000/2021" + } +]; + +fn get_eng_url(url: &str) -> Result { + if url.is_empty() { + return Ok("".to_string()); + } + + let res = reqwest::blocking::get(url)?.text()?; + let document = scraper::Html::parse_document(&res); + let mut link_ite = document.select(&LANG).map(|x| x.inner_html()); + + link_ite.next().ok_or(eyre!("Cannot get english url")) +} + +fn get_desc_degree_page(url: &str) -> Result { + let eng_url_temp = get_eng_url(url)?; + + // ignore language requirements + pseudocourses with no webpages + if eng_url_temp.contains("26338") || eng_url_temp.is_empty() { + return Ok("".to_string()); + } + + let start = eng_url_temp.find("http").unwrap_or(0); + let tmp = eng_url_temp.substring(start, eng_url_temp.len()); + let end = tmp.find('\"').unwrap_or(0); + let degree_url = tmp.substring(0, end); + + let eng_page = reqwest::blocking::get(degree_url)?.text()?; + let document = scraper::Html::parse_document(&eng_page); + + let degree_title = document + .select(&TITLE) + .next() + .ok_or(eyre!("Cannot parse degree title"))? + .text() + .join(""); + + let full_description = document + .select(&DESC) + .next() + .ok_or(eyre!("Cannot parse degree description"))? + .text() + .join(""); + + let i = full_description + .find("Learning outcomes") + .unwrap_or(full_description.len()); + + let mut f: Option = DESC_END_MARKER + .get("*") + .and_then(|marker| full_description.find(marker)); + + for (pattern, marker) in DESC_END_MARKER.iter() { + if degree_title.contains(pattern.as_str()) { + f = full_description + .find(marker) + .or(Some(full_description.len())); + break; + } + } + + let filtered_description = full_description + .substring( + i, + f.ok_or(eyre!( + "No description end marker defined for this page content" + ))? - 2, + ) + .split('\n') + .map(|item| item.trim()) + .filter(|item| !item.is_empty()) + .join("\n\n"); + + Ok(format!( + "\n== {}[{}]\n{}", + degree_url, + degree_title.as_str(), + filtered_description.trim() + )) +} + +pub fn analyze_degree( + degree_name: &str, + output_file: &Path, + teachings_url: &str, +) -> Result<(), eyre::ErrReport> { + let res = reqwest::blocking::get(teachings_url)?.text()?; + + let document = scraper::Html::parse_document(&res); + let title_list = document.select(&TABLE); + + let mut buf = format!("= {degree_name}\n\n"); + + for item in title_list { + let mut entry_doc = "".to_string(); + + let a_el = item + .children() + .filter_map(|f| f.value().as_element()) + .find(|r| r.name() == "a") + .map(|a_el| a_el.attr("href")) + .flatten(); + + let teaching_url = match a_el { + Some(a) => a, + None => { + eprintln!("Cannot parse an element: {}", item.text().join("").trim()); + continue; + } + }; + + print!("Visiting {}", teaching_url); + let teaching_desc = match get_desc_degree_page(teaching_url) { + Ok(desc) => desc, + Err(e) => { + eprintln!("Cannot get teaching description: {}", e); + continue; + } + }; + + entry_doc += "\n"; + entry_doc += teaching_desc.as_str(); + + for (source, replacement) in MISSING_TRANSLATIONS.iter() { + entry_doc = entry_doc.replace(source, replacement); + } + + buf.write_str(&entry_doc)?; + println!("\t✓"); + } + std::fs::write(output_file, buf)?; + Ok(()) +} diff --git a/src/main.rs b/src/main.rs index 4dec024..e16c6fa 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,140 +1,9 @@ -use std::{collections::HashMap, fmt::Write, fs, path::Path}; +use std::{fmt::Write, fs, path::Path}; -use eyre::{eyre, Result}; -use itertools::Itertools; -use lazy_static::lazy_static; -use scraper::Selector; -use substring::Substring; +pub mod degrees; +use degrees::{analyze_degree, Degree, DEGREES}; -struct Course { - name: &'static str, - slug: &'static str, - url: &'static str, -} - -const COURSES: &[Course] = &[ - Course { - name: "Informatica 2022", - slug: "informatica-2022-2022", - url: "https://corsi.unibo.it/laurea/informatica/insegnamenti/piano/2022/8009/000/000/2022" - }, - Course { - name: "Ingegneria Informatica 2022", - slug: "ing-informatica-2022-2022", - url: "https://corsi.unibo.it/laurea/IngegneriaInformatica/insegnamenti/piano/2021/9254/000/000/2021" - } -]; - -lazy_static! { - static ref TABLE: Selector = scraper::Selector::parse("td.title").unwrap(); - static ref TITLE: Selector = scraper::Selector::parse("div#u-content-intro>h1").unwrap(); - static ref LANG: Selector = scraper::Selector::parse("li.language-en").unwrap(); - static ref DESC: Selector = scraper::Selector::parse("div.description-text").unwrap(); - static ref DESC_END_MARKER: HashMap = [ - ("Numerical Computing".to_string(), "Teaching".to_string()), - ("History of Informatics".to_string(), "Office".to_string()), - ("*".to_string(), "Readings".to_string()) - ] - .into(); - static ref MISSING_TRANSLATIONS: HashMap = [ - ("BASI DI DATI".to_string(), "DATABASES".to_string()), - ( - "INTRODUZIONE ALL'APPRENDIMENTO AUTOMATICO".to_string(), - "Introduction to machine learning".to_string() - ), - ("FONDAMENTI DI".to_string(), "".to_string()), - ( - "Learning outcomes".to_string(), - "=== Learning outcomes".to_string() - ), - ( - "Course contents".to_string(), - "=== Course contents".to_string() - ) - ] - .into(); -} - -fn get_desc_course_page(url: &str) -> Result { - let eng_url_temp = get_eng_url(url)?; - - // escludo l'idoneità di inglese e i corsi che non hanno una pagina (prova finale, tirocinio, corsi non attivi...) - if eng_url_temp.contains("26338") || eng_url_temp.is_empty() { - return Ok("".to_string()); - } - - let start = eng_url_temp.find("http").unwrap_or(0); - let tmp = eng_url_temp.substring(start, eng_url_temp.len()); - let end = tmp.find('\"').unwrap_or(0); - let course_url = tmp.substring(0, end); - - let eng_page = reqwest::blocking::get(course_url)?.text()?; - let document = scraper::Html::parse_document(&eng_page); - - let course_title = document - .select(&TITLE) - .next() - .ok_or(eyre!("Cannot parse course title"))? - .text() - .join(""); - - let full_description = document - .select(&DESC) - .next() - .ok_or(eyre!("Cannot parse course description"))? - .text() - .join(""); - - let i = full_description - .find("Learning outcomes") - .unwrap_or(full_description.len()); - - let mut f: Option = DESC_END_MARKER - .get("*") - .and_then(|marker| full_description.find(marker)); - - for (pattern, marker) in DESC_END_MARKER.iter() { - if course_title.contains(pattern.as_str()) { - f = full_description - .find(marker) - .or(Some(full_description.len())); - break; - } - } - - let filtered_description = full_description - .substring( - i, - f.ok_or(eyre!( - "No description end marker defined for this page content" - ))? - 2, - ) - .split('\n') - .map(|item| item.trim()) - .filter(|item| !item.is_empty()) - .join("\n\n"); - - Ok(format!( - "\n== {}[{}]\n{}", - course_url, - course_title.as_str(), - filtered_description.trim() - )) -} - -fn get_eng_url(url: &str) -> Result { - if url.is_empty() { - return Ok("".to_string()); - } - - let res = reqwest::blocking::get(url)?.text()?; - let document = scraper::Html::parse_document(&res); - let mut link_ite = document.select(&LANG).map(|x| x.inner_html()); - - link_ite.next().ok_or(eyre!("Cannot get english url")) -} - -fn main() -> Result<()> { +fn main() -> Result<(), eyre::Report> { color_eyre::install()?; let output_dir = Path::new("output"); @@ -144,65 +13,12 @@ fn main() -> Result<()> { let mut index = "= Index\n\n".to_owned(); - for Course { slug, name, url } in COURSES { - analyze_course(name, &output_dir.join(format!("course-{}.adoc", slug)), url)?; - write!(index, "* xref:course-{}.adoc[{}]\n", slug, name)?; + for Degree { slug, name, url } in DEGREES { + analyze_degree(name, &output_dir.join(format!("degree-{}.adoc", slug)), url)?; + write!(index, "* xref:degree-{}.adoc[{}]\n", slug, name)?; } fs::write(output_dir.join("index.adoc"), index)?; Ok(()) } - -fn analyze_course( - course_name: &str, - output_file: &Path, - insegnamenti_url: &str, -) -> Result<(), eyre::ErrReport> { - let res = reqwest::blocking::get(insegnamenti_url)?.text()?; - - let document = scraper::Html::parse_document(&res); - let title_list = document.select(&TABLE); - - let mut buf = format!("= {course_name}\n\n"); - - for item in title_list { - let mut entry_doc = "".to_string(); - - let a_el = item - .children() - .filter_map(|f| f.value().as_element()) - .find(|r| r.name() == "a") - .map(|a_el| a_el.attr("href")) - .flatten(); - - let course_url = match a_el { - Some(a) => a, - None => { - eprintln!("Cannot parse an element: {}", item.text().join("").trim()); - continue; - } - }; - - print!("Visiting {}", course_url); - let course_desc = match get_desc_course_page(course_url) { - Ok(desc) => desc, - Err(e) => { - eprintln!("Cannot get course description: {}", e); - continue; - } - }; - - entry_doc += "\n"; - entry_doc += course_desc.as_str(); - - for (source, replacement) in MISSING_TRANSLATIONS.iter() { - entry_doc = entry_doc.replace(source, replacement); - } - - buf.write_str(&entry_doc)?; - println!("\t✓"); - } - std::fs::write(output_file, buf)?; - Ok(()) -} diff --git a/src/teachings.rs b/src/teachings.rs new file mode 100644 index 0000000..0a6023e --- /dev/null +++ b/src/teachings.rs @@ -0,0 +1 @@ +mod teachings; From 769e9a6a7095fee55f70061bfb6557228c74f696 Mon Sep 17 00:00:00 2001 From: Stefano Volpe Date: Tue, 18 Jun 2024 01:20:41 +0200 Subject: [PATCH 02/11] feat: wip fix date, use config --- Cargo.lock | 544 ++++++++++++++++++++++++++++++++------- Cargo.toml | 3 + src/degrees.rs | 188 -------------- src/degrees/mod.rs | 117 +++++++++ src/degrees/teachings.rs | 84 ++++++ src/degrees/year.rs | 13 + src/main.rs | 18 +- src/teachings.rs | 1 - 8 files changed, 679 insertions(+), 289 deletions(-) delete mode 100644 src/degrees.rs create mode 100644 src/degrees/mod.rs create mode 100644 src/degrees/teachings.rs create mode 100644 src/degrees/year.rs delete mode 100644 src/teachings.rs diff --git a/Cargo.lock b/Cargo.lock index eb2c1cf..a388437 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -30,6 +30,21 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "android-tzdata" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + [[package]] name = "atomic-waker" version = "1.1.2" @@ -95,9 +110,9 @@ checksum = "514de17de45fdb8dc022b1a7975556c53c86f9f0aa5f534b98977b171857c2c9" [[package]] name = "cc" -version = "1.0.98" +version = "1.0.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41c270e7540d725e65ac7f1b212ac8ce349719624d7bcff99f8e2e488e8cf03f" +checksum = "96c51067fd44124faa7f870b4b1c969379ad32b2ba805aa959430ceaa384f695" [[package]] name = "cfg-if" @@ -105,6 +120,20 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "chrono" +version = "0.4.38" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401" +dependencies = [ + "android-tzdata", + "iana-time-zone", + "js-sys", + "num-traits", + "wasm-bindgen", + "windows-targets 0.52.5", +] + [[package]] name = "color-eyre" version = "0.6.3" @@ -152,12 +181,15 @@ checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" name = "course-description-merged" version = "0.1.0" dependencies = [ + "chrono", "color-eyre", "eyre", "itertools", "lazy_static", "reqwest", "scraper", + "serde", + "serde_json", "substring", ] @@ -181,18 +213,29 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331" dependencies = [ "quote", - "syn 2.0.65", + "syn 2.0.66", ] [[package]] name = "derive_more" -version = "0.99.17" +version = "0.99.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fb810d30a7c1953f91334de7244731fc3f3c10d7fe163338a35b9f640960321" +checksum = "5f33878137e4dafd7fa914ad4e259e18a4e8e532b9617a2d0150262bf53abfce" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.66", +] + +[[package]] +name = "displaydoc" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.66", ] [[package]] @@ -203,9 +246,9 @@ checksum = "dcbb2bf8e87535c23f7a8a321e364ce21462d0ff10cb6407820e8e96dfff6653" [[package]] name = "dtoa-short" -version = "0.3.4" +version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbaceec3c6e4211c79e7b1800fb9680527106beb2f9c51904a3210c03a448c74" +checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87" dependencies = [ "dtoa", ] @@ -413,12 +456,6 @@ version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" -[[package]] -name = "hermit-abi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" - [[package]] name = "html5ever" version = "0.26.0" @@ -456,12 +493,12 @@ dependencies = [ [[package]] name = "http-body-util" -version = "0.1.1" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0475f8b2ac86659c21b64320d5d653f9efe42acd2a4e560073ec61a155a34f1d" +checksum = "793429d76616a256bcb62c2a2ec2bed781c8307e797e2598c50010f2bee2544f" dependencies = [ "bytes", - "futures-core", + "futures-util", "http", "http-body", "pin-project-lite", @@ -469,9 +506,9 @@ dependencies = [ [[package]] name = "httparse" -version = "1.8.0" +version = "1.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904" +checksum = "0fcc0b4a115bf80b728eb8ea024ad5bd707b615bfed49e0665b6e0f86fd082d9" [[package]] name = "hyper" @@ -493,6 +530,23 @@ dependencies = [ "want", ] +[[package]] +name = "hyper-rustls" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ee4be2c948921a1a5320b629c4193916ed787a7f7f293fd3f7f5a6c9de74155" +dependencies = [ + "futures-util", + "http", + "hyper", + "hyper-util", + "rustls", + "rustls-pki-types", + "tokio", + "tokio-rustls", + "tower-service", +] + [[package]] name = "hyper-tls" version = "0.6.0" @@ -511,9 +565,9 @@ dependencies = [ [[package]] name = "hyper-util" -version = "0.1.3" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca38ef113da30126bbff9cd1705f9273e15d45498615d138b0c20279ac7a76aa" +checksum = "7b875924a60b96e5d7b9ae7b066540b1dd1cbd90d1828f54c92e02a283351c56" dependencies = [ "bytes", "futures-channel", @@ -529,14 +583,157 @@ dependencies = [ "tracing", ] +[[package]] +name = "iana-time-zone" +version = "0.1.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7ffbb5a1b541ea2561f8c41c087286cc091e21e556a4f09a8f6cbf17b69b141" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "wasm-bindgen", + "windows-core", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + +[[package]] +name = "icu_collections" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db2fa452206ebee18c4b5c2274dbf1de17008e874b4dc4f0aea9d01ca79e4526" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locid" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13acbb8371917fc971be86fc8057c41a64b521c184808a698c02acc242dbf637" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_locid_transform" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01d11ac35de8e40fdeda00d9e1e9d92525f3f9d887cdd7aa81d727596788b54e" +dependencies = [ + "displaydoc", + "icu_locid", + "icu_locid_transform_data", + "icu_provider", + "tinystr", + "zerovec", +] + +[[package]] +name = "icu_locid_transform_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdc8ff3388f852bede6b579ad4e978ab004f139284d7b28715f773507b946f6e" + +[[package]] +name = "icu_normalizer" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19ce3e0da2ec68599d193c93d088142efd7f9c5d6fc9b803774855747dc6a84f" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "utf16_iter", + "utf8_iter", + "write16", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516" + +[[package]] +name = "icu_properties" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f8ac670d7422d7f76b32e17a5db556510825b29ec9154f235977c9caba61036" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_locid_transform", + "icu_properties_data", + "icu_provider", + "tinystr", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67a8effbc3dd3e4ba1afa8ad918d5684b8868b3b26500753effea8d2eed19569" + +[[package]] +name = "icu_provider" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ed421c8a8ef78d3e2dbc98a973be2f3770cb42b606e3ab18d6237c4dfde68d9" +dependencies = [ + "displaydoc", + "icu_locid", + "icu_provider_macros", + "stable_deref_trait", + "tinystr", + "writeable", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_provider_macros" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.66", +] + [[package]] name = "idna" -version = "0.5.0" +version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6" +checksum = "4716a3a0933a1d01c2f72450e89596eb51dd34ef3c211ccd875acdf1f8fe47ed" dependencies = [ - "unicode-bidi", - "unicode-normalization", + "icu_normalizer", + "icu_properties", + "smallvec", + "utf8_iter", ] [[package]] @@ -603,6 +800,12 @@ version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" +[[package]] +name = "litemap" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704" + [[package]] name = "lock_api" version = "0.4.12" @@ -641,9 +844,9 @@ dependencies = [ [[package]] name = "memchr" -version = "2.7.2" +version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" [[package]] name = "mime" @@ -673,11 +876,10 @@ dependencies = [ [[package]] name = "native-tls" -version = "0.2.11" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e" +checksum = "a8614eb2c83d59d1c8cc974dd3f920198647674a0a035e1af1fa58707e317466" dependencies = [ - "lazy_static", "libc", "log", "openssl", @@ -696,13 +898,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" [[package]] -name = "num_cpus" -version = "1.16.0" +name = "num-traits" +version = "0.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ - "hermit-abi", - "libc", + "autocfg", ] [[package]] @@ -743,7 +944,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.65", + "syn 2.0.66", ] [[package]] @@ -772,9 +973,9 @@ checksum = "c1b04fb49957986fdce4d6ee7a65027d55d4b6d2265e5848bbb507b58ccfdb6f" [[package]] name = "parking_lot" -version = "0.12.2" +version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e4af0ca4f6caed20e900d564c242b8e5d4903fdacf31d3daf527b66fe6f42fb" +checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" dependencies = [ "lock_api", "parking_lot_core", @@ -858,7 +1059,7 @@ dependencies = [ "phf_shared 0.11.2", "proc-macro2", "quote", - "syn 2.0.65", + "syn 2.0.66", ] [[package]] @@ -896,7 +1097,7 @@ checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" dependencies = [ "proc-macro2", "quote", - "syn 2.0.65", + "syn 2.0.66", ] [[package]] @@ -931,9 +1132,9 @@ checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" [[package]] name = "proc-macro2" -version = "1.0.83" +version = "1.0.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b33eb56c327dec362a9e55b3ad14f9d2f0904fb5a5b03b513ab5465399e9f43" +checksum = "22244ce15aa966053a896d1accb3a6e68469b97c7f33f284b99f0d576879fc23" dependencies = [ "unicode-ident", ] @@ -979,18 +1180,18 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.5.1" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "469052894dcb553421e483e4209ee581a45100d31b4018de03e5a7ad86374a7e" +checksum = "c82cf8cff14456045f55ec4241383baeff27af886adb72ffb2162f99911de0fd" dependencies = [ "bitflags 2.5.0", ] [[package]] name = "reqwest" -version = "0.12.4" +version = "0.12.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "566cafdd92868e0939d3fb961bd0dc25fcfaaed179291093b3d43e6b3150ea10" +checksum = "c7d6d2a27d57148378eb5e111173f4276ad26340ecc5c49a4a2152167a2d6a37" dependencies = [ "base64", "bytes", @@ -1003,6 +1204,7 @@ dependencies = [ "http-body", "http-body-util", "hyper", + "hyper-rustls", "hyper-tls", "hyper-util", "ipnet", @@ -1029,6 +1231,21 @@ dependencies = [ "winreg", ] +[[package]] +name = "ring" +version = "0.17.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c17fa4cb658e3583423e915b9f3acc01cceaee1860e33d59ebae66adc3a2dc0d" +dependencies = [ + "cc", + "cfg-if", + "getrandom", + "libc", + "spin", + "untrusted", + "windows-sys 0.52.0", +] + [[package]] name = "rustc-demangle" version = "0.1.24" @@ -1048,6 +1265,19 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "rustls" +version = "0.23.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05cff451f60db80f490f3c182b77c35260baace73209e9cdbbe526bfe3a4d402" +dependencies = [ + "once_cell", + "rustls-pki-types", + "rustls-webpki", + "subtle", + "zeroize", +] + [[package]] name = "rustls-pemfile" version = "2.1.2" @@ -1064,6 +1294,17 @@ version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "976295e77ce332211c0d24d92c0e83e50f5c5f046d11082cea19f3df13a3562d" +[[package]] +name = "rustls-webpki" +version = "0.102.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff448f7e92e913c4b7d4c6d8e4540a1724b319b4152b8aef6d4cf8339712b33e" +dependencies = [ + "ring", + "rustls-pki-types", + "untrusted", +] + [[package]] name = "ryu" version = "1.0.18" @@ -1145,22 +1386,22 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.202" +version = "1.0.203" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "226b61a0d411b2ba5ff6d7f73a476ac4f8bb900373459cd00fab8512828ba395" +checksum = "7253ab4de971e72fb7be983802300c30b5a7f0c2e56fab8abfc6a214307c0094" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.202" +version = "1.0.203" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6048858004bcff69094cd972ed40a32500f153bd3be9f716b2eed2e8217c4838" +checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba" dependencies = [ "proc-macro2", "quote", - "syn 2.0.65", + "syn 2.0.66", ] [[package]] @@ -1235,6 +1476,12 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "spin" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" + [[package]] name = "stable_deref_trait" version = "1.2.0" @@ -1276,6 +1523,12 @@ dependencies = [ "autocfg", ] +[[package]] +name = "subtle" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" + [[package]] name = "syn" version = "1.0.109" @@ -1289,9 +1542,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.65" +version = "2.0.66" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2863d96a84c6439701d7a38f9de935ec562c8832cc55d1dde0f513b52fad106" +checksum = "c42f3f41a2de00b01c0aaad383c5a45241efc8b2d1eda5661812fda5f3cdcff5" dependencies = [ "proc-macro2", "quote", @@ -1300,9 +1553,20 @@ dependencies = [ [[package]] name = "sync_wrapper" -version = "0.1.2" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" +checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394" + +[[package]] +name = "synstructure" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.66", +] [[package]] name = "system-configuration" @@ -1359,31 +1623,25 @@ dependencies = [ ] [[package]] -name = "tinyvec" -version = "1.6.0" +name = "tinystr" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" +checksum = "9117f5d4db391c1cf6927e7bea3db74b9a1c1add8f7eda9ffd5364f40f57b82f" dependencies = [ - "tinyvec_macros", + "displaydoc", + "zerovec", ] -[[package]] -name = "tinyvec_macros" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" - [[package]] name = "tokio" -version = "1.37.0" +version = "1.38.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1adbebffeca75fcfd058afa480fb6c0b81e165a0323f9c9d39c9697e37c46787" +checksum = "ba4f4a02a7a80d6f274636f0aa95c7e383b912d41fe721a31f29e29698585a4a" dependencies = [ "backtrace", "bytes", "libc", "mio", - "num_cpus", "pin-project-lite", "socket2", "windows-sys 0.48.0", @@ -1399,6 +1657,17 @@ dependencies = [ "tokio", ] +[[package]] +name = "tokio-rustls" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4" +dependencies = [ + "rustls", + "rustls-pki-types", + "tokio", +] + [[package]] name = "tokio-util" version = "0.7.11" @@ -1425,7 +1694,6 @@ dependencies = [ "tokio", "tower-layer", "tower-service", - "tracing", ] [[package]] @@ -1446,7 +1714,6 @@ version = "0.1.40" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" dependencies = [ - "log", "pin-project-lite", "tracing-core", ] @@ -1488,12 +1755,6 @@ version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" -[[package]] -name = "unicode-bidi" -version = "0.3.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75" - [[package]] name = "unicode-ident" version = "1.0.12" @@ -1501,25 +1762,22 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" [[package]] -name = "unicode-normalization" -version = "0.1.23" +name = "unicode-width" +version = "0.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5" -dependencies = [ - "tinyvec", -] +checksum = "0336d538f7abc86d282a4189614dfaa90810dfc2c6f6427eaf88e16311dd225d" [[package]] -name = "unicode-width" -version = "0.1.12" +name = "untrusted" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68f5e5f3158ecfd4b8ff6fe086db7c8467a2dfdac97fe420f2b7c4aa97af66d6" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" [[package]] name = "url" -version = "2.5.0" +version = "2.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31e6302e3bb753d46e83516cae55ae196fc0c309407cf11ab35cc51a4c2a4633" +checksum = "f7c25da092f0a868cdf09e8674cd3b7ef3a7d92a24253e663a2fb85e2496de56" dependencies = [ "form_urlencoded", "idna", @@ -1532,6 +1790,18 @@ version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" +[[package]] +name = "utf16_iter" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246" + +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + [[package]] name = "valuable" version = "0.1.0" @@ -1586,7 +1856,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.65", + "syn 2.0.66", "wasm-bindgen-shared", ] @@ -1620,7 +1890,7 @@ checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.65", + "syn 2.0.66", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -1641,6 +1911,15 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "windows-core" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" +dependencies = [ + "windows-targets 0.52.5", +] + [[package]] name = "windows-sys" version = "0.48.0" @@ -1790,6 +2069,42 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "write16" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936" + +[[package]] +name = "writeable" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51" + +[[package]] +name = "yoke" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c5b1314b079b0930c31e3af543d8ee1757b1951ae1e1565ec704403a7240ca5" +dependencies = [ + "serde", + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28cc31741b18cb6f1d5ff12f5b7523e3d6eb0852bbbad19d73905511d9849b95" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.66", + "synstructure", +] + [[package]] name = "zerocopy" version = "0.7.34" @@ -1807,5 +2122,54 @@ checksum = "15e934569e47891f7d9411f1a451d947a60e000ab3bd24fbb970f000387d1b3b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.65", + "syn 2.0.66", +] + +[[package]] +name = "zerofrom" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91ec111ce797d0e0784a1116d0ddcdbea84322cd79e5d5ad173daeba4f93ab55" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ea7b4a3637ea8669cedf0f1fd5c286a17f3de97b8dd5a70a6c167a1730e63a5" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.66", + "synstructure", +] + +[[package]] +name = "zeroize" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" + +[[package]] +name = "zerovec" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb2cc8827d6c0994478a15c53f374f46fbd41bea663d809b14744bc42e6b109c" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97cf56601ee5052b4417d90c8755c6683473c926039908196cf35d99f893ebe7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.66", ] diff --git a/Cargo.toml b/Cargo.toml index 1ec8423..ebad377 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,10 +5,13 @@ edition = "2021" [dependencies] +chrono = "0.4.38" color-eyre = "0.6.2" eyre = "0.6.12" itertools = "0.13.0" lazy_static = "1.4.0" reqwest = { version = "0.12", features = ["blocking"] } scraper = "0.19.0" +serde = { version = "1.0.203", features = ["derive"] } +serde_json = "1.0.117" substring = "1.4.5" diff --git a/src/degrees.rs b/src/degrees.rs deleted file mode 100644 index 4822b4b..0000000 --- a/src/degrees.rs +++ /dev/null @@ -1,188 +0,0 @@ -use std::{collections::HashMap, fmt::Write, path::Path}; - -use eyre::{eyre, Result}; -use itertools::Itertools; -use lazy_static::lazy_static; -use scraper::Selector; -use substring::Substring; - -lazy_static! { - static ref TABLE: Selector = scraper::Selector::parse("td.title").unwrap(); - static ref TITLE: Selector = scraper::Selector::parse("div#u-content-intro>h1").unwrap(); - static ref LANG: Selector = scraper::Selector::parse("li.language-en").unwrap(); - static ref DESC: Selector = scraper::Selector::parse("div.description-text").unwrap(); - static ref DESC_END_MARKER: HashMap = [ - ("Numerical Computing".to_string(), "Teaching".to_string()), - ("History of Informatics".to_string(), "Office".to_string()), - ("*".to_string(), "Readings".to_string()) - ] - .into(); - static ref MISSING_TRANSLATIONS: HashMap = [ - ("BASI DI DATI".to_string(), "DATABASES".to_string()), - ( - "INTRODUZIONE ALL'APPRENDIMENTO AUTOMATICO".to_string(), - "Introduction to machine learning".to_string() - ), - ("FONDAMENTI DI".to_string(), "".to_string()), - ( - "Learning outcomes".to_string(), - "=== Learning outcomes".to_string() - ), - ( - "Degree contents".to_string(), - "=== Degree contents".to_string() - ) - ] - .into(); -} - -pub struct Degree { - pub name: &'static str, - pub slug: &'static str, - pub url: &'static str, -} - -pub const DEGREES: &[Degree] = &[ - Degree { - name: "Informatica", - slug: "informatica", - url: "https://corsi.unibo.it/laurea/informatica/insegnamenti/piano/2022/8009/000/000/2022" - }, - Degree { - name: "Ingegneria Informatica", - slug: "ing-informatica", - url: "https://corsi.unibo.it/laurea/IngegneriaInformatica/insegnamenti/piano/2021/9254/000/000/2021" - } -]; - -fn get_eng_url(url: &str) -> Result { - if url.is_empty() { - return Ok("".to_string()); - } - - let res = reqwest::blocking::get(url)?.text()?; - let document = scraper::Html::parse_document(&res); - let mut link_ite = document.select(&LANG).map(|x| x.inner_html()); - - link_ite.next().ok_or(eyre!("Cannot get english url")) -} - -fn get_desc_degree_page(url: &str) -> Result { - let eng_url_temp = get_eng_url(url)?; - - // ignore language requirements + pseudocourses with no webpages - if eng_url_temp.contains("26338") || eng_url_temp.is_empty() { - return Ok("".to_string()); - } - - let start = eng_url_temp.find("http").unwrap_or(0); - let tmp = eng_url_temp.substring(start, eng_url_temp.len()); - let end = tmp.find('\"').unwrap_or(0); - let degree_url = tmp.substring(0, end); - - let eng_page = reqwest::blocking::get(degree_url)?.text()?; - let document = scraper::Html::parse_document(&eng_page); - - let degree_title = document - .select(&TITLE) - .next() - .ok_or(eyre!("Cannot parse degree title"))? - .text() - .join(""); - - let full_description = document - .select(&DESC) - .next() - .ok_or(eyre!("Cannot parse degree description"))? - .text() - .join(""); - - let i = full_description - .find("Learning outcomes") - .unwrap_or(full_description.len()); - - let mut f: Option = DESC_END_MARKER - .get("*") - .and_then(|marker| full_description.find(marker)); - - for (pattern, marker) in DESC_END_MARKER.iter() { - if degree_title.contains(pattern.as_str()) { - f = full_description - .find(marker) - .or(Some(full_description.len())); - break; - } - } - - let filtered_description = full_description - .substring( - i, - f.ok_or(eyre!( - "No description end marker defined for this page content" - ))? - 2, - ) - .split('\n') - .map(|item| item.trim()) - .filter(|item| !item.is_empty()) - .join("\n\n"); - - Ok(format!( - "\n== {}[{}]\n{}", - degree_url, - degree_title.as_str(), - filtered_description.trim() - )) -} - -pub fn analyze_degree( - degree_name: &str, - output_file: &Path, - teachings_url: &str, -) -> Result<(), eyre::ErrReport> { - let res = reqwest::blocking::get(teachings_url)?.text()?; - - let document = scraper::Html::parse_document(&res); - let title_list = document.select(&TABLE); - - let mut buf = format!("= {degree_name}\n\n"); - - for item in title_list { - let mut entry_doc = "".to_string(); - - let a_el = item - .children() - .filter_map(|f| f.value().as_element()) - .find(|r| r.name() == "a") - .map(|a_el| a_el.attr("href")) - .flatten(); - - let teaching_url = match a_el { - Some(a) => a, - None => { - eprintln!("Cannot parse an element: {}", item.text().join("").trim()); - continue; - } - }; - - print!("Visiting {}", teaching_url); - let teaching_desc = match get_desc_degree_page(teaching_url) { - Ok(desc) => desc, - Err(e) => { - eprintln!("Cannot get teaching description: {}", e); - continue; - } - }; - - entry_doc += "\n"; - entry_doc += teaching_desc.as_str(); - - for (source, replacement) in MISSING_TRANSLATIONS.iter() { - entry_doc = entry_doc.replace(source, replacement); - } - - buf.write_str(&entry_doc)?; - println!("\t✓"); - } - std::fs::write(output_file, buf)?; - Ok(()) -} diff --git a/src/degrees/mod.rs b/src/degrees/mod.rs new file mode 100644 index 0000000..29dd396 --- /dev/null +++ b/src/degrees/mod.rs @@ -0,0 +1,117 @@ +use std::{fmt::Write, fs, path::Path}; + +use itertools::Itertools; +use scraper::Selector; + +pub mod teachings; +pub mod year; + +lazy_static::lazy_static! { + static ref TABLE: Selector = Selector::parse("td.title").unwrap(); + static ref MISSING_TRANSLATIONS: std::collections::HashMap = [ + ("BASI DI DATI".to_string(), "DATABASES".to_string()), + ( + "INTRODUZIONE ALL'APPRENDIMENTO AUTOMATICO".to_string(), + "Introduction to machine learning".to_string() + ), + ("FONDAMENTI DI".to_string(), "".to_string()), + ( + "Learning outcomes".to_string(), + "=== Learning outcomes".to_string() + ), + ( + "Teaching contents".to_string(), + "=== Teaching contents".to_string() + ) + ] + .into(); +} + +#[derive(serde::Deserialize, Debug, Clone)] +struct Predegree { + id: String, + name: String, + code: String, +} + +pub struct Degree { + pub name: String, + pub slug: String, + pub url: String, +} + +const DEGREES_PATH: &str = "config/degrees.json"; + +fn parse_degree(predegree: &Predegree, academic_year: u32) -> Degree { + let Predegree { name, id, code } = predegree; + let unibo_slug = name.replace("", ""); + Degree { + name: name.to_string(), + slug: id.to_string(), + url: format!("https://corsi.unibo.it/laurea/{unibo_slug}/insegnamenti/piano/{academic_year}/{code}/000/{academic_year}") + } +} + +fn to_degrees(predegrees: Vec) -> Vec { + let academic_year = year::current_academic_year(); + predegrees + .iter() + .map(|predegree| parse_degree(predegree, academic_year)) + .collect() +} + +pub fn analyze_degree( + degree_name: &str, + output_file: &Path, + teachings_url: &str, +) -> eyre::Result<(), eyre::ErrReport> { + let res = reqwest::blocking::get(teachings_url)?.text()?; + let document = scraper::Html::parse_document(&res); + let title_list = document.select(&TABLE); + let mut buf = format!("= {degree_name}\n\n"); + for item in title_list { + let mut entry_doc = "".to_string(); + let a_el = item + .children() + .filter_map(|f| f.value().as_element()) + .find(|r| r.name() == "a") + .map(|a_el| a_el.attr("href")) + .flatten(); + let teaching_url = match a_el { + Some(a) => a, + None => { + eprintln!("Cannot parse an element: {}", item.text().join("").trim()); + continue; + } + }; + print!("Visiting {}", teaching_url); + let teaching_desc = match teachings::get_desc_teaching_page(teaching_url) { + Ok(desc) => desc, + Err(e) => { + eprintln!("Cannot get teaching description: {}", e); + continue; + } + }; + entry_doc += "\n"; + entry_doc += teaching_desc.as_str(); + for (source, replacement) in MISSING_TRANSLATIONS.iter() { + entry_doc = entry_doc.replace(source, replacement); + } + buf.write_str(&entry_doc)?; + println!("\t✓"); + } + fs::write(output_file, buf)?; + Ok(()) +} + +pub fn degrees() -> Vec { + let file = match fs::File::open(DEGREES_PATH) { + Ok(file) => file, + Err(error) => panic!("Reading {DEGREES_PATH:?}: {error:?}"), + }; + let json: Vec = match serde_json::from_reader(file) { + Ok(json) => json, + Err(error) => panic!("Parsing {DEGREES_PATH}: {error:?}"), + }; + to_degrees(json) +} diff --git a/src/degrees/teachings.rs b/src/degrees/teachings.rs new file mode 100644 index 0000000..1099c9a --- /dev/null +++ b/src/degrees/teachings.rs @@ -0,0 +1,84 @@ +use eyre::eyre; +use eyre::Result; +use itertools::Itertools; +use reqwest::blocking; +use scraper::Selector; +use substring::Substring; + +lazy_static::lazy_static! { + static ref TITLE: Selector = Selector::parse("div#u-content-intro>h1").unwrap(); + static ref LANG: Selector = Selector::parse("li.language-en").unwrap(); + static ref DESC: Selector = Selector::parse("div.description-text").unwrap(); + static ref DESC_END_MARKER: std::collections::HashMap = [ + ("Numerical Computing".to_string(), "Teaching".to_string()), + ("History of Informatics".to_string(), "Office".to_string()), + ("*".to_string(), "Readings".to_string()) + ] + .into(); +} + +fn get_eng_url(url: &str) -> Result { + if url.is_empty() { + return Ok("".to_string()); + } + let res = blocking::get(url)?.text()?; + let document = scraper::Html::parse_document(&res); + let mut link_ite = document.select(&LANG).map(|x| x.inner_html()); + link_ite.next().ok_or(eyre!("Cannot get english url")) +} + +pub fn get_desc_teaching_page(url: &str) -> Result { + let eng_url_temp = match get_eng_url(url) { + Ok(url) => url, + Err(e) => return Err(eyre!(e.to_string())), // interniships, thesis... + }; + let start = eng_url_temp.find("http").unwrap_or(0); + let tmp = eng_url_temp.substring(start, eng_url_temp.len()); + let end = tmp.find('\"').unwrap_or(0); + let teaching_url = tmp.substring(0, end); + let eng_page = blocking::get(teaching_url)?.text()?; + let document = scraper::Html::parse_document(&eng_page); + let teaching_title = document + .select(&TITLE) + .next() + .ok_or(eyre!("Cannot parse teaching title"))? + .text() + .join(""); + let full_description = document + .select(&DESC) + .next() + .ok_or(eyre!("Cannot parse teaching description"))? + .text() + .join(""); + let i = full_description + .find("Learning outcomes") + .unwrap_or(full_description.len()); + let mut f: Option = DESC_END_MARKER + .get("*") + .and_then(|marker| full_description.find(marker)); + for (pattern, marker) in DESC_END_MARKER.iter() { + if teaching_title.contains(pattern.as_str()) { + f = full_description + .find(marker) + .or(Some(full_description.len())); + break; + } + } + let filtered_description = full_description + .substring( + i, + f.ok_or(eyre!( + "No description end marker defined for this page content" + ))? - 2, + ) + .split('\n') + .map(|item| item.trim()) + .filter(|item| !item.is_empty()) + .join("\n\n"); + Ok(format!( + "\n== {}[{}]\n{}", + teaching_url, + teaching_title.as_str(), + filtered_description.trim() + )) +} diff --git a/src/degrees/year.rs b/src/degrees/year.rs new file mode 100644 index 0000000..9d3dd01 --- /dev/null +++ b/src/degrees/year.rs @@ -0,0 +1,13 @@ +use chrono::Datelike; + +const SEPTEMBER: u32 = 9; + +pub fn current_academic_year() -> u32 { + let n = chrono::prelude::Local::now(); + let (_, y) = n.year_ce(); + if n.month() >= SEPTEMBER { + y + } else { + y - 1 + } +} diff --git a/src/main.rs b/src/main.rs index e16c6fa..ba128bf 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,24 +1,22 @@ -use std::{fmt::Write, fs, path::Path}; +use std::{fmt::Write, fs}; pub mod degrees; -use degrees::{analyze_degree, Degree, DEGREES}; fn main() -> Result<(), eyre::Report> { color_eyre::install()?; - - let output_dir = Path::new("output"); + let output_dir = std::path::Path::new("output"); if !output_dir.exists() { fs::create_dir(output_dir)?; } - let mut index = "= Index\n\n".to_owned(); - - for Degree { slug, name, url } in DEGREES { - analyze_degree(name, &output_dir.join(format!("degree-{}.adoc", slug)), url)?; + for degrees::Degree { slug, name, url } in degrees::degrees() { + degrees::analyze_degree( + &name, + &output_dir.join(format!("degree-{}.adoc", slug)), + &url, + )?; write!(index, "* xref:degree-{}.adoc[{}]\n", slug, name)?; } - fs::write(output_dir.join("index.adoc"), index)?; - Ok(()) } diff --git a/src/teachings.rs b/src/teachings.rs deleted file mode 100644 index 0a6023e..0000000 --- a/src/teachings.rs +++ /dev/null @@ -1 +0,0 @@ -mod teachings; From 19926ff4ae67ffbf43d0a92b1285f2a0ea993e8d Mon Sep 17 00:00:00 2001 From: Stefano Volpe Date: Tue, 18 Jun 2024 02:03:28 +0200 Subject: [PATCH 03/11] chore: clippy, dependabot, actions --- .github/dependabot.yml | 4 ++++ .github/workflows/pages.yml | 16 ++++++++++++++++ src/degrees/mod.rs | 5 ++--- src/main.rs | 2 +- 4 files changed, 23 insertions(+), 4 deletions(-) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index ca6709c..c347e0f 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -15,3 +15,7 @@ updates: - dependency-name: "json/config" schedule: interval: "daily" + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "weekly" diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml index 2f8de40..37385d1 100644 --- a/.github/workflows/pages.yml +++ b/.github/workflows/pages.yml @@ -42,6 +42,22 @@ jobs: target/ key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} + - uses: actions-rs/cargo@v1 + with: + command: clippy +<<<<<<< Updated upstream + + - uses: actions-rs/cargo@v1 + with: + command: fix + + - uses: actions-rs/cargo@v1 + with: + command: fmt +======= + args: --release +>>>>>>> Stashed changes + - uses: actions-rs/cargo@v1 with: command: run diff --git a/src/degrees/mod.rs b/src/degrees/mod.rs index 29dd396..8179d24 100644 --- a/src/degrees/mod.rs +++ b/src/degrees/mod.rs @@ -44,7 +44,7 @@ const DEGREES_PATH: &str = "config/degrees.json"; fn parse_degree(predegree: &Predegree, academic_year: u32) -> Degree { let Predegree { name, id, code } = predegree; - let unibo_slug = name.replace("", ""); + let unibo_slug = name.replace(" ", ""); Degree { name: name.to_string(), slug: id.to_string(), @@ -75,8 +75,7 @@ pub fn analyze_degree( .children() .filter_map(|f| f.value().as_element()) .find(|r| r.name() == "a") - .map(|a_el| a_el.attr("href")) - .flatten(); + .and_then(|a_el| a_el.attr("href")); let teaching_url = match a_el { Some(a) => a, None => { diff --git a/src/main.rs b/src/main.rs index ba128bf..65d2cb8 100644 --- a/src/main.rs +++ b/src/main.rs @@ -15,7 +15,7 @@ fn main() -> Result<(), eyre::Report> { &output_dir.join(format!("degree-{}.adoc", slug)), &url, )?; - write!(index, "* xref:degree-{}.adoc[{}]\n", slug, name)?; + writeln!(index, "* xref:degree-{}.adoc[{}]", slug, name)?; } fs::write(output_dir.join("index.adoc"), index)?; Ok(()) From 88247d46f68993c07af46a9c2add4de1560a9d73 Mon Sep 17 00:00:00 2001 From: Stefano Volpe Date: Tue, 18 Jun 2024 02:12:11 +0200 Subject: [PATCH 04/11] chore: clippy, actions, more string substitution --- .github/workflows/pages.yml | 4 ---- src/degrees/mod.rs | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml index 37385d1..e3ba353 100644 --- a/.github/workflows/pages.yml +++ b/.github/workflows/pages.yml @@ -45,7 +45,6 @@ jobs: - uses: actions-rs/cargo@v1 with: command: clippy -<<<<<<< Updated upstream - uses: actions-rs/cargo@v1 with: @@ -54,9 +53,6 @@ jobs: - uses: actions-rs/cargo@v1 with: command: fmt -======= - args: --release ->>>>>>> Stashed changes - uses: actions-rs/cargo@v1 with: diff --git a/src/degrees/mod.rs b/src/degrees/mod.rs index 8179d24..8497c72 100644 --- a/src/degrees/mod.rs +++ b/src/degrees/mod.rs @@ -44,7 +44,7 @@ const DEGREES_PATH: &str = "config/degrees.json"; fn parse_degree(predegree: &Predegree, academic_year: u32) -> Degree { let Predegree { name, id, code } = predegree; - let unibo_slug = name.replace(" ", ""); + let unibo_slug = name.replace(" e ", "").replace(' ', ""); Degree { name: name.to_string(), slug: id.to_string(), From d1b4d06321b34aa6d53def6cdbf7eefe1b39830b Mon Sep 17 00:00:00 2001 From: Stefano Volpe Date: Tue, 18 Jun 2024 03:35:14 +0200 Subject: [PATCH 05/11] feat: logging --- Cargo.lock | 138 ++++++++++++++++++++++++++++++++++++++++++++- Cargo.toml | 4 +- config | 2 +- src/degrees/mod.rs | 67 ++++++++++++++++------ src/main.rs | 42 ++++++++++---- 5 files changed, 220 insertions(+), 33 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a388437..4b403d5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -30,6 +30,15 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + [[package]] name = "android-tzdata" version = "0.1.1" @@ -45,6 +54,55 @@ dependencies = [ "libc", ] +[[package]] +name = "anstream" +version = "0.6.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "418c75fa768af9c03be99d17643f93f79bbba589895012a80e3452a19ddda15b" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "038dfcf04a5feb68e9c60b21c9625a54c2c0616e79b72b0fd87075a056ae1d1b" + +[[package]] +name = "anstyle-parse" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c03a11a9034d92058ceb6ee011ce58af4a9bf61491aa7e1e59ecd24bd40d22d4" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad186efb764318d35165f1758e7dcef3b10628e26d41a44bc5550652e6804391" +dependencies = [ + "windows-sys 0.52.0", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61a38449feb7068f52bb06c12759005cf459ee52bb4adc1d5a7c4322d716fb19" +dependencies = [ + "anstyle", + "windows-sys 0.52.0", +] + [[package]] name = "atomic-waker" version = "1.1.2" @@ -161,6 +219,12 @@ dependencies = [ "tracing-error", ] +[[package]] +name = "colorchoice" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422" + [[package]] name = "core-foundation" version = "0.9.4" @@ -179,13 +243,15 @@ checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" [[package]] name = "course-description-merged" -version = "0.1.0" +version = "1.0.0" dependencies = [ "chrono", "color-eyre", + "env_logger", "eyre", "itertools", "lazy_static", + "log", "reqwest", "scraper", "serde", @@ -274,6 +340,29 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "env_filter" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a009aa4810eb158359dda09d0c87378e4bbb89b5a801f016885a4707ba24f7ea" +dependencies = [ + "log", + "regex", +] + +[[package]] +name = "env_logger" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38b35839ba51819680ba087cd351788c9a3c476841207e0b8cee0b04722343b9" +dependencies = [ + "anstream", + "anstyle", + "env_filter", + "humantime", + "log", +] + [[package]] name = "equivalent" version = "1.0.1" @@ -510,6 +599,12 @@ version = "1.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fcc0b4a115bf80b728eb8ea024ad5bd707b615bfed49e0665b6e0f86fd082d9" +[[package]] +name = "humantime" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" + [[package]] name = "hyper" version = "1.3.1" @@ -758,6 +853,12 @@ version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3" +[[package]] +name = "is_terminal_polyfill" +version = "1.70.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800" + [[package]] name = "itertools" version = "0.13.0" @@ -1187,6 +1288,35 @@ dependencies = [ "bitflags 2.5.0", ] +[[package]] +name = "regex" +version = "1.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b91213439dad192326a0d7c6ee3955910425f441d7038e0d6933b0aec5c4517f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" + [[package]] name = "reqwest" version = "0.12.5" @@ -1802,6 +1932,12 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + [[package]] name = "valuable" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index ebad377..ce4295c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,15 +1,17 @@ [package] name = "course-description-merged" -version = "0.1.0" +version = "1.0.0" edition = "2021" [dependencies] chrono = "0.4.38" color-eyre = "0.6.2" +env_logger = "0.11.3" eyre = "0.6.12" itertools = "0.13.0" lazy_static = "1.4.0" +log = "0.4.21" reqwest = { version = "0.12", features = ["blocking"] } scraper = "0.19.0" serde = { version = "1.0.203", features = ["derive"] } diff --git a/config b/config index f3a84ec..6819e7e 160000 --- a/config +++ b/config @@ -1 +1 @@ -Subproject commit f3a84ec3afeaaa18d620384ff17cc58a665e9ead +Subproject commit 6819e7e6316e7e80b3a20abf2544fa56bc853756 diff --git a/src/degrees/mod.rs b/src/degrees/mod.rs index 8497c72..2869037 100644 --- a/src/degrees/mod.rs +++ b/src/degrees/mod.rs @@ -1,6 +1,7 @@ use std::{fmt::Write, fs, path::Path}; use itertools::Itertools; +use log::{error, info, warn}; use scraper::Selector; pub mod teachings; @@ -60,13 +61,30 @@ fn to_degrees(predegrees: Vec) -> Vec { .collect() } -pub fn analyze_degree( - degree_name: &str, - output_file: &Path, - teachings_url: &str, -) -> eyre::Result<(), eyre::ErrReport> { - let res = reqwest::blocking::get(teachings_url)?.text()?; - let document = scraper::Html::parse_document(&res); +pub fn analyze_degree(degree_name: &str, output_file: &Path, teachings_url: &str) -> Option<()> { + info!("{degree_name} ({teachings_url})"); + let res = match reqwest::blocking::get(teachings_url) { + Ok(res) => res, + Err(e) => { + error!("\t{e:?}"); + return None; + } + }; + let res2 = match res.error_for_status() { + Ok(res2) => res2, + Err(e) => { + error!("\t{e:?}"); + return None; + } + }; + let text = match res2.text() { + Ok(text) => text, + Err(e) => { + error!("\t{e:?}"); + return None; + } + }; + let document = scraper::Html::parse_document(&text); let title_list = document.select(&TABLE); let mut buf = format!("= {degree_name}\n\n"); for item in title_list { @@ -76,18 +94,20 @@ pub fn analyze_degree( .filter_map(|f| f.value().as_element()) .find(|r| r.name() == "a") .and_then(|a_el| a_el.attr("href")); + let temp_name = item.text().join(""); + let name = temp_name.trim(); let teaching_url = match a_el { Some(a) => a, None => { - eprintln!("Cannot parse an element: {}", item.text().join("").trim()); + warn!("\tMissing link: {name}"); continue; } }; - print!("Visiting {}", teaching_url); + info!("\tVisiting {name}"); let teaching_desc = match teachings::get_desc_teaching_page(teaching_url) { Ok(desc) => desc, Err(e) => { - eprintln!("Cannot get teaching description: {}", e); + error!("\t\tCannot get description: {e:?}"); continue; } }; @@ -96,21 +116,32 @@ pub fn analyze_degree( for (source, replacement) in MISSING_TRANSLATIONS.iter() { entry_doc = entry_doc.replace(source, replacement); } - buf.write_str(&entry_doc)?; - println!("\t✓"); + if let Err(e) = buf.write_str(&entry_doc) { + error!("\t\tCannot append: {e:?}"); + return None; + }; } - fs::write(output_file, buf)?; - Ok(()) + if let Err(e) = fs::write(output_file, buf) { + error!("\t\tCannot write: {e:?}"); + return None; + }; + Some(()) } -pub fn degrees() -> Vec { +pub fn degrees() -> Option> { let file = match fs::File::open(DEGREES_PATH) { Ok(file) => file, - Err(error) => panic!("Reading {DEGREES_PATH:?}: {error:?}"), + Err(error) => { + error!("Reading {DEGREES_PATH:?}: {error:?}"); + return None; + } }; let json: Vec = match serde_json::from_reader(file) { Ok(json) => json, - Err(error) => panic!("Parsing {DEGREES_PATH}: {error:?}"), + Err(error) => { + error!("Parsing {DEGREES_PATH}: {error:?}"); + return None; + } }; - to_degrees(json) + Some(to_degrees(json)) } diff --git a/src/main.rs b/src/main.rs index 65d2cb8..e45fb3c 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,22 +1,40 @@ +use degrees::degrees; +use log::error; use std::{fmt::Write, fs}; pub mod degrees; -fn main() -> Result<(), eyre::Report> { - color_eyre::install()?; +fn main() -> () { + let env = env_logger::Env::default().default_filter_or("info"); + env_logger::Builder::from_env(env).init(); + if let Err(e) = color_eyre::install() { + error!("Eyre setup: {e}"); + return; + }; let output_dir = std::path::Path::new("output"); if !output_dir.exists() { - fs::create_dir(output_dir)?; + if let Err(e) = fs::create_dir(output_dir) { + error!("Output dir creation: {e}"); + return; + }; } let mut index = "= Index\n\n".to_owned(); - for degrees::Degree { slug, name, url } in degrees::degrees() { - degrees::analyze_degree( - &name, - &output_dir.join(format!("degree-{}.adoc", slug)), - &url, - )?; - writeln!(index, "* xref:degree-{}.adoc[{}]", slug, name)?; + if let Some(deg) = degrees() { + for degrees::Degree { slug, name, url } in deg { + degrees::analyze_degree( + &name, + &output_dir.join(format!("degree-{}.adoc", slug)), + &url, + ); + if let Err(e) = writeln!(index, "* xref:degree-{}.adoc[{}]", slug, name) { + error!("Could not append {name}: {e}"); + }; + } + } else { + error!("Could not load degrees"); + return; } - fs::write(output_dir.join("index.adoc"), index)?; - Ok(()) + if let Err(e) = fs::write(output_dir.join("index.adoc"), index) { + error!("Could not write index: {e}") + }; } From 571733129e5e64f1d90352bac417936f6fc4543b Mon Sep 17 00:00:00 2001 From: Stefano Volpe Date: Tue, 18 Jun 2024 04:21:10 +0200 Subject: [PATCH 06/11] feat: wip finetune degrees url search --- Cargo.lock | 1 + Cargo.toml | 1 + src/degrees/mod.rs | 102 ++++++++++++++++++++++++++------------------- src/main.rs | 14 +++---- 4 files changed, 65 insertions(+), 53 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4b403d5..ed6e28f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -252,6 +252,7 @@ dependencies = [ "itertools", "lazy_static", "log", + "regex", "reqwest", "scraper", "serde", diff --git a/Cargo.toml b/Cargo.toml index ce4295c..1971893 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,6 +12,7 @@ eyre = "0.6.12" itertools = "0.13.0" lazy_static = "1.4.0" log = "0.4.21" +regex = "1.10.5" reqwest = { version = "0.12", features = ["blocking"] } scraper = "0.19.0" serde = { version = "1.0.203", features = ["derive"] } diff --git a/src/degrees/mod.rs b/src/degrees/mod.rs index 2869037..12ccf5f 100644 --- a/src/degrees/mod.rs +++ b/src/degrees/mod.rs @@ -43,27 +43,41 @@ pub struct Degree { const DEGREES_PATH: &str = "config/degrees.json"; -fn parse_degree(predegree: &Predegree, academic_year: u32) -> Degree { +fn parse_degree(predegree: &Predegree, academic_year: u32) -> Option { let Predegree { name, id, code } = predegree; - let unibo_slug = name.replace(" e ", "").replace(' ', ""); - Degree { + if name.is_empty() || id.is_empty() || code.is_empty() { + return None; + } + let unibo_slug = regex::Regex::new(r" (((e|per il) )|Magistrale)") + .unwrap() + .replace_all(name, "") + .to_string() + .to_ascii_lowercase() + .replace(' ', ""); + let degree_type = match name.find("Magistrale") { + Some(_) => "magistrale", + None => "laurea", + }; + Some(Degree { name: name.to_string(), slug: id.to_string(), - url: format!("https://corsi.unibo.it/laurea/{unibo_slug}/insegnamenti/piano/{academic_year}/{code}/000/{academic_year}") - } + url: format!("https://corsi.unibo.it/{degree_type}/{unibo_slug}/insegnamenti/piano/{academic_year}/{code}/000/{academic_year}") + }) } fn to_degrees(predegrees: Vec) -> Vec { let academic_year = year::current_academic_year(); predegrees .iter() - .map(|predegree| parse_degree(predegree, academic_year)) + .filter_map(|predegree| parse_degree(predegree, academic_year)) .collect() } -pub fn analyze_degree(degree_name: &str, output_file: &Path, teachings_url: &str) -> Option<()> { - info!("{degree_name} ({teachings_url})"); - let res = match reqwest::blocking::get(teachings_url) { +pub fn analyze_degree(degree: &Degree, output_dir: &Path) -> Option<()> { + let Degree { slug, name, url } = degree; + let output_file = output_dir.join(format!("degree-{slug}.adoc")); + info!("{name} ({url})"); + let res = match reqwest::blocking::get(url) { Ok(res) => res, Err(e) => { error!("\t{e:?}"); @@ -86,41 +100,41 @@ pub fn analyze_degree(degree_name: &str, output_file: &Path, teachings_url: &str }; let document = scraper::Html::parse_document(&text); let title_list = document.select(&TABLE); - let mut buf = format!("= {degree_name}\n\n"); - for item in title_list { - let mut entry_doc = "".to_string(); - let a_el = item - .children() - .filter_map(|f| f.value().as_element()) - .find(|r| r.name() == "a") - .and_then(|a_el| a_el.attr("href")); - let temp_name = item.text().join(""); - let name = temp_name.trim(); - let teaching_url = match a_el { - Some(a) => a, - None => { - warn!("\tMissing link: {name}"); - continue; - } - }; - info!("\tVisiting {name}"); - let teaching_desc = match teachings::get_desc_teaching_page(teaching_url) { - Ok(desc) => desc, - Err(e) => { - error!("\t\tCannot get description: {e:?}"); - continue; - } - }; - entry_doc += "\n"; - entry_doc += teaching_desc.as_str(); - for (source, replacement) in MISSING_TRANSLATIONS.iter() { - entry_doc = entry_doc.replace(source, replacement); - } - if let Err(e) = buf.write_str(&entry_doc) { - error!("\t\tCannot append: {e:?}"); - return None; - }; - } + let mut buf = format!("= {name}\n\n"); + //for item in title_list { + // let mut entry_doc = "".to_string(); + // let a_el = item + // .children() + // .filter_map(|f| f.value().as_element()) + // .find(|r| r.name() == "a") + // .and_then(|a_el| a_el.attr("href")); + // let temp_name = item.text().join(""); + // let name = temp_name.trim(); + // let teaching_url = match a_el { + // Some(a) => a, + // None => { + // warn!("\tMissing link: {name}"); + // continue; + // } + // }; + // info!("\tVisiting {name}"); + // let teaching_desc = match teachings::get_desc_teaching_page(teaching_url) { + // Ok(desc) => desc, + // Err(e) => { + // error!("\t\tCannot get description: {e:?}"); + // continue; + // } + // }; + // entry_doc += "\n"; + // entry_doc += teaching_desc.as_str(); + // for (source, replacement) in MISSING_TRANSLATIONS.iter() { + // entry_doc = entry_doc.replace(source, replacement); + // } + // if let Err(e) = buf.write_str(&entry_doc) { + // error!("\t\tCannot append: {e:?}"); + // return None; + // }; + //} if let Err(e) = fs::write(output_file, buf) { error!("\t\tCannot write: {e:?}"); return None; diff --git a/src/main.rs b/src/main.rs index e45fb3c..7f5cfcb 100644 --- a/src/main.rs +++ b/src/main.rs @@ -4,7 +4,7 @@ use std::{fmt::Write, fs}; pub mod degrees; -fn main() -> () { +fn main() { let env = env_logger::Env::default().default_filter_or("info"); env_logger::Builder::from_env(env).init(); if let Err(e) = color_eyre::install() { @@ -20,14 +20,10 @@ fn main() -> () { } let mut index = "= Index\n\n".to_owned(); if let Some(deg) = degrees() { - for degrees::Degree { slug, name, url } in deg { - degrees::analyze_degree( - &name, - &output_dir.join(format!("degree-{}.adoc", slug)), - &url, - ); - if let Err(e) = writeln!(index, "* xref:degree-{}.adoc[{}]", slug, name) { - error!("Could not append {name}: {e}"); + for d in deg { + degrees::analyze_degree(&d, &output_dir); + if let Err(e) = writeln!(index, "* xref:degree-{}.adoc[{}]", d.slug, d.name) { + error!("Could not append {}: {}", d.name, e); }; } } else { From b46a59d779fbe0917a864ce56f9bce13d4b53365 Mon Sep 17 00:00:00 2001 From: Stefano Volpe Date: Tue, 18 Jun 2024 05:10:16 +0200 Subject: [PATCH 07/11] feat: before testing --- config | 2 +- src/degrees/mod.rs | 99 +++++++++++++++++++++++++--------------------- 2 files changed, 56 insertions(+), 45 deletions(-) diff --git a/config b/config index 6819e7e..4633c04 160000 --- a/config +++ b/config @@ -1 +1 @@ -Subproject commit 6819e7e6316e7e80b3a20abf2544fa56bc853756 +Subproject commit 4633c04acbb912c49ae1d337cff5d33e3622f1f4 diff --git a/src/degrees/mod.rs b/src/degrees/mod.rs index 12ccf5f..ecf5e48 100644 --- a/src/degrees/mod.rs +++ b/src/degrees/mod.rs @@ -43,20 +43,31 @@ pub struct Degree { const DEGREES_PATH: &str = "config/degrees.json"; +fn to_lowercase_maybe(s: String, b: bool) -> String { + if b { + return s.to_lowercase(); + } + s +} + fn parse_degree(predegree: &Predegree, academic_year: u32) -> Option { let Predegree { name, id, code } = predegree; if name.is_empty() || id.is_empty() || code.is_empty() { return None; } - let unibo_slug = regex::Regex::new(r" (((e|per il) )|Magistrale)") - .unwrap() - .replace_all(name, "") - .to_string() - .to_ascii_lowercase() - .replace(' ', ""); - let degree_type = match name.find("Magistrale") { - Some(_) => "magistrale", - None => "laurea", + let unibo_slug = to_lowercase_maybe( + regex::Regex::new(r"( (e|per il|in) )|Magistrale|Master") + .unwrap() + .replace_all(name, "") + .to_string(), + !code.eq("9254/000"), + ) + // AI's slug is kebab-case + .replace(' ', if code.eq("9063/000") { "-" } else { "" }); + let degree_type = if name.find("Magistrale").is_some() || name.find("Master").is_some() { + "magistrale" + } else { + "laurea" }; Some(Degree { name: name.to_string(), @@ -76,7 +87,7 @@ fn to_degrees(predegrees: Vec) -> Vec { pub fn analyze_degree(degree: &Degree, output_dir: &Path) -> Option<()> { let Degree { slug, name, url } = degree; let output_file = output_dir.join(format!("degree-{slug}.adoc")); - info!("{name} ({url})"); + info!("{name} [{url}]"); let res = match reqwest::blocking::get(url) { Ok(res) => res, Err(e) => { @@ -101,40 +112,40 @@ pub fn analyze_degree(degree: &Degree, output_dir: &Path) -> Option<()> { let document = scraper::Html::parse_document(&text); let title_list = document.select(&TABLE); let mut buf = format!("= {name}\n\n"); - //for item in title_list { - // let mut entry_doc = "".to_string(); - // let a_el = item - // .children() - // .filter_map(|f| f.value().as_element()) - // .find(|r| r.name() == "a") - // .and_then(|a_el| a_el.attr("href")); - // let temp_name = item.text().join(""); - // let name = temp_name.trim(); - // let teaching_url = match a_el { - // Some(a) => a, - // None => { - // warn!("\tMissing link: {name}"); - // continue; - // } - // }; - // info!("\tVisiting {name}"); - // let teaching_desc = match teachings::get_desc_teaching_page(teaching_url) { - // Ok(desc) => desc, - // Err(e) => { - // error!("\t\tCannot get description: {e:?}"); - // continue; - // } - // }; - // entry_doc += "\n"; - // entry_doc += teaching_desc.as_str(); - // for (source, replacement) in MISSING_TRANSLATIONS.iter() { - // entry_doc = entry_doc.replace(source, replacement); - // } - // if let Err(e) = buf.write_str(&entry_doc) { - // error!("\t\tCannot append: {e:?}"); - // return None; - // }; - //} + for item in title_list { + let mut entry_doc = "".to_string(); + let a_el = item + .children() + .filter_map(|f| f.value().as_element()) + .find(|r| r.name() == "a") + .and_then(|a_el| a_el.attr("href")); + let temp_name = item.text().join(""); + let name = temp_name.trim(); + let teaching_url = match a_el { + Some(a) => a, + None => { + warn!("\tMissing link: {name}"); + continue; + } + }; + info!("\tVisiting {name}"); + let teaching_desc = match teachings::get_desc_teaching_page(teaching_url) { + Ok(desc) => desc, + Err(e) => { + error!("\t\tCannot get description: {e:?}"); + continue; + } + }; + entry_doc += "\n"; + entry_doc += teaching_desc.as_str(); + for (source, replacement) in MISSING_TRANSLATIONS.iter() { + entry_doc = entry_doc.replace(source, replacement); + } + if let Err(e) = buf.write_str(&entry_doc) { + error!("\t\tCannot append: {e:?}"); + return None; + }; + } if let Err(e) = fs::write(output_file, buf) { error!("\t\tCannot write: {e:?}"); return None; From db27bf8afa74fa1d27648e26c38d9c2970a145fd Mon Sep 17 00:00:00 2001 From: Stefano Volpe Date: Tue, 18 Jun 2024 05:18:05 +0200 Subject: [PATCH 08/11] fix: ci/cd submodule recurse --- .github/workflows/pages.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml index e3ba353..0d3d679 100644 --- a/.github/workflows/pages.yml +++ b/.github/workflows/pages.yml @@ -29,6 +29,8 @@ jobs: name: Run project steps: - uses: actions/checkout@v3 + with: + submodules: recursive - uses: actions-rs/toolchain@v1 with: toolchain: stable From 681c284d4b28dfa56a33c0bc212816ee6afba659 Mon Sep 17 00:00:00 2001 From: Stefano Volpe Date: Tue, 18 Jun 2024 05:32:31 +0200 Subject: [PATCH 09/11] chore: clippy --- Cargo.lock | 4 ++-- src/degrees/mod.rs | 2 +- src/main.rs | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ed6e28f..2b3a5be 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -958,9 +958,9 @@ checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" [[package]] name = "miniz_oxide" -version = "0.7.3" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87dfd01fe195c66b572b37921ad8803d010623c0aca821bea2302239d155cdae" +checksum = "b8a240ddb74feaf34a79a7add65a741f3167852fba007066dcac1ca548d89c08" dependencies = [ "adler", ] diff --git a/src/degrees/mod.rs b/src/degrees/mod.rs index ecf5e48..6f1257b 100644 --- a/src/degrees/mod.rs +++ b/src/degrees/mod.rs @@ -64,7 +64,7 @@ fn parse_degree(predegree: &Predegree, academic_year: u32) -> Option { ) // AI's slug is kebab-case .replace(' ', if code.eq("9063/000") { "-" } else { "" }); - let degree_type = if name.find("Magistrale").is_some() || name.find("Master").is_some() { + let degree_type = if name.contains("Magistrale") || name.contains("Master") { "magistrale" } else { "laurea" diff --git a/src/main.rs b/src/main.rs index 7f5cfcb..8d6d376 100644 --- a/src/main.rs +++ b/src/main.rs @@ -21,7 +21,7 @@ fn main() { let mut index = "= Index\n\n".to_owned(); if let Some(deg) = degrees() { for d in deg { - degrees::analyze_degree(&d, &output_dir); + degrees::analyze_degree(&d, output_dir); if let Err(e) = writeln!(index, "* xref:degree-{}.adoc[{}]", d.slug, d.name) { error!("Could not append {}: {}", d.name, e); }; From 4087f679a9a5ee439268205d0803fb01d67e7453 Mon Sep 17 00:00:00 2001 From: Stefano Volpe Date: Tue, 18 Jun 2024 05:47:41 +0200 Subject: [PATCH 10/11] chore: update submodule --- config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config b/config index 4633c04..70107e5 160000 --- a/config +++ b/config @@ -1 +1 @@ -Subproject commit 4633c04acbb912c49ae1d337cff5d33e3622f1f4 +Subproject commit 70107e58a4194b57ee67f55b1d669a88edd6ab7d From 8ee439bdeba93bee1f1bda207f787e2048b5864c Mon Sep 17 00:00:00 2001 From: Stefano Volpe Date: Tue, 18 Jun 2024 11:21:04 +0200 Subject: [PATCH 11/11] chore: point config submodule to main --- config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config b/config index 70107e5..76fbf43 160000 --- a/config +++ b/config @@ -1 +1 @@ -Subproject commit 70107e58a4194b57ee67f55b1d669a88edd6ab7d +Subproject commit 76fbf43d97b757c94c7336395706ff5dbca33247