From 4f8df251ad5911b2a53929fdf7eb8c7423d6fa8d Mon Sep 17 00:00:00 2001
From: Christian Seiler
Date: Mon, 30 Dec 2024 12:29:30 +0100
Subject: [PATCH] Add initial (working) version of gemeinderat bot

---
 .gitignore          |   5 +
 Cargo.toml          |  15 +++
 config.toml.example |  11 +++
 src/main.rs         | 268 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 299 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Cargo.toml
 create mode 100644 config.toml.example
 create mode 100644 src/main.rs

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..6d11e88
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+/target
+Cargo.lock
+*~
+config.toml
+state.toml
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..9cf259c
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,15 @@
+[package]
+name = "gemeinderat-monitor"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+reqwest = { version = "0.12" }
+scraper = { version = "0.22" }
+tokio = { version = "1", features = ["full"] }
+url = "2.5.4"
+config = "0.15.4"
+serde = { version = "1.0.217", features = ["derive"] }
+toml = "0.8.19"
+matrix-sdk = "0.9.0"
+anyhow = "1.0.95"
diff --git a/config.toml.example b/config.toml.example
new file mode 100644
index 0000000..3f4654b
--- /dev/null
+++ b/config.toml.example
@@ -0,0 +1,11 @@
+keywords = ["kamera", "video", "überwachung", "künstlich", "intelligenz"]
+overview_url = "https://www.tuebingen.de/gemeinderat/info.php"
+detail_link_path = "si0057.php"
+id_query_key = "__ksinr"
+detail_generate_url = "https://www.tuebingen.de/gemeinderat/si0056.php"
+id_generate_key = "__ksinr"
+matrix_server_url = "https://matrix.org"
+matrix_user = "cttue-bottest-gemeinderat"
+matrix_password = "XXXXXXXXXXXXX"
+matrix_room = "cttue-bottest-room"
+
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..dbd47ca
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,268 @@
+//! Monitors the agenda pages of a municipal council website for configured
+//! keywords and reports new or changed matches to a Matrix room.
+//!
+//! Settings are read from `config.toml`; the outcome of the previous run is
+//! kept in `state.toml` so that only changes are announced.
+
+use std::collections::{HashSet, HashMap};
+use serde::{Serialize, Deserialize};
+use std::fs;
+use std::io::Write;
+use anyhow::Result;
+use matrix_sdk::{
+    config::SyncSettings,
+    Client, Room,
+    ruma::events::room::message::RoomMessageEventContent,
+};
+
+/// Settings deserialized from `config.toml`.
+#[derive(Serialize, Deserialize, Debug, PartialEq, Default)]
+struct Config {
+    /// Keywords searched for (case-insensitively) in the agenda text.
+    keywords : Vec<String>,
+    overview_url : String, // "https://www.tuebingen.de/gemeinderat/info.php"
+    detail_link_path : String, // "si0057.php"
+    id_query_key : String, // "__ksinr"
+    detail_generate_url : String, // "https://www.tuebingen.de/gemeinderat/si0056.php"
+    id_generate_key : String, // "__ksinr"
+    matrix_server_url : String,
+    matrix_user : String,
+    matrix_password : String,
+    /// Display name of the room the bot posts to.
+    matrix_room : String,
+}
+
+/// Keywords found per session, keyed by the session id taken from the overview links.
+#[derive(Serialize, Deserialize, Debug, PartialEq, Default)]
+struct ScrapeResult {
+    /// Session id -> sorted list of the configured keywords found on its page.
+    by_id : HashMap<String, Vec<String>>,
+}
+
+/// Outcome of the previous run as persisted in `state.toml`.
+#[derive(Serialize, Deserialize, Debug, PartialEq, Default)]
+struct State {
+    scrape_result : Option<ScrapeResult>,
+    error : Option<String>,
+}
+
+/// Boxes a concrete error so differently typed errors can share one `Result` chain.
+fn box_error<T: std::error::Error + 'static>(error: T) -> Box<dyn std::error::Error> {
+    Box::new(error)
+}
+
+/// Collects the session ids linked from the overview page and searches each
+/// session's detail page for the configured keywords.
+///
+/// Only sessions with at least one keyword hit are part of the result. The
+/// keywords of each session are sorted, because the result is compared with
+/// the persisted one from the previous run and the iteration order of a
+/// `HashSet` differs from run to run.
+///
+/// # Errors
+///
+/// Fails if a configured URL cannot be parsed or an HTTP request fails.
+async fn scrape_websites(config: &Config) -> Result<ScrapeResult, Box<dyn std::error::Error>> {
+    let overview_url = url::Url::parse(config.overview_url.as_str())?;
+    let resp = reqwest::get(overview_url.as_str())
+        .await?
+        .text()
+        .await?;
+    let fragment = scraper::Html::parse_document(&resp);
+    let selector = scraper::Selector::parse("a[href]").unwrap();
+    let mut ids = Vec::<String>::new();
+    for element in fragment.select(&selector) {
+        // The selector only matches anchors that carry an href attribute.
+        let link_target = overview_url.join(element.attr("href").unwrap());
+        if let Ok(link_target) = link_target {
+            if link_target.path().contains(&config.detail_link_path) {
+                let mut nr : Option<String> = None;
+                for (k, v) in link_target.query_pairs() {
+                    if k == config.id_query_key {
+                        nr = Some(v.to_string());
+                        break;
+                    }
+                }
+                if let Some(nr) = nr {
+                    // A session may be linked more than once; fetch it only once.
+                    if !ids.contains(&nr) {
+                        ids.push(nr);
+                    }
+                }
+            }
+        }
+    }
+    let base_sub_url = url::Url::parse(config.detail_generate_url.as_str())?;
+    let selectors = vec![
+        scraper::Selector::parse("div.smc-card-text-title").unwrap(),
+        scraper::Selector::parse("div.card-body").unwrap(),
+    ];
+    // Lowercase the keywords once instead of once per text node.
+    let lowercase_keywords : Vec<String> = config.keywords.iter().map(|keyword| keyword.to_lowercase()).collect();
+    let mut by_id = HashMap::<String, Vec<String>>::new();
+    for id in ids {
+        let mut sub_url = base_sub_url.clone();
+        let query = format!("{}={}", config.id_generate_key, &id);
+        sub_url.set_query(Some(&query));
+        let resp = reqwest::get(sub_url.as_str())
+            .await?
+            .text()
+            .await?;
+        let fragment = scraper::Html::parse_document(&resp);
+        let mut contained_keywords = HashSet::<String>::new();
+        for selector in &selectors {
+            for element in fragment.select(selector) {
+                for text in element.text() {
+                    let text = text.to_lowercase();
+                    for (keyword, lowercase_keyword) in config.keywords.iter().zip(&lowercase_keywords) {
+                        if text.contains(lowercase_keyword.as_str()) {
+                            contained_keywords.insert(keyword.to_string());
+                        }
+                    }
+                }
+            }
+        }
+
+        if !contained_keywords.is_empty() {
+            // Sort so that the stored list is comparable across runs.
+            let mut contained_keywords : Vec<String> = contained_keywords.into_iter().collect();
+            contained_keywords.sort();
+            by_id.insert(id, contained_keywords);
+        }
+    }
+
+    Ok(ScrapeResult{by_id})
+}
+
+/// Sends `message` as a plain-text message to `room`.
+async fn post_message(room: &Room, message: &str) -> Result<(), Box<dyn std::error::Error>>
+{
+    room.send(RoomMessageEventContent::text_plain(message)).await?;
+    Ok(())
+}
+
+/// Sends `header` followed by every entry of `messages` to `room`.
+async fn post_messages(room: &Room, header: &str, messages: &[String]) -> Result<(), Box<dyn std::error::Error>>
+{
+    post_message(room, header).await?;
+    for message in messages {
+        post_message(room, message).await?;
+    }
+    Ok(())
+}
+
+/// Builds one notification text per session whose keyword list is new or
+/// differs from `old_result`.
+///
+/// With `old_result` being `None`, every session in `new_result` is reported.
+/// The messages are ordered by session id.
+///
+/// # Panics
+///
+/// Panics if `config.detail_generate_url` is not a valid URL.
+fn get_keyword_messages(config: &Config, old_result: Option<&ScrapeResult>, new_result: &ScrapeResult) -> Vec<String>
+{
+    let base_sub_url = url::Url::parse(config.detail_generate_url.as_str()).unwrap();
+    // Iterate in a fixed order; a `HashMap` yields its entries in arbitrary order.
+    let mut entries : Vec<(&String, &Vec<String>)> = new_result.by_id.iter().collect();
+    entries.sort();
+    let mut new_pages = Vec::<String>::new();
+    for (id, new_keywords) in entries {
+        let old_keywords = old_result.and_then(|old_result| old_result.by_id.get(id));
+        let changed = old_keywords != Some(new_keywords);
+        if changed && !new_keywords.is_empty() {
+            let mut sub_url = base_sub_url.clone();
+            let query = format!("{}={}", config.id_generate_key, id);
+            sub_url.set_query(Some(&query));
+            new_pages.push(format!("Auf der Seite {} sind folgende Keywords in der Tagesordnung gefunden worden (neue Seite oder Änderung an den Keywords): {:?}",
+                                   sub_url.as_str(), new_keywords));
+        }
+    }
+    new_pages
+}
+
+/// Logs in to Matrix, scrapes the council pages, announces changes compared
+/// with the previous run and persists the new state.
+#[tokio::main]
+async fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let builder = config::Config::builder()
+        .add_source(config::File::new("config.toml", config::FileFormat::Toml));
+
+    let config : Config = builder.build()?.try_deserialize()?;
+
+    // Report configuration and connection problems as errors instead of panicking.
+    let matrix_server_url = url::Url::parse(&config.matrix_server_url)
+        .map_err(|err| format!("Couldn't parse the matrix server URL: {}", err))?;
+    let client = Client::new(matrix_server_url).await?;
+    client
+        .matrix_auth()
+        .login_username(&config.matrix_user, &config.matrix_password)
+        .initial_device_display_name("gemeinderat-bot")
+        .await?;
+
+    client.sync_once(SyncSettings::default()).await?;
+
+    let room = client.rooms()
+        .into_iter()
+        .find(|r| r.name().as_deref() == Some(config.matrix_room.as_str()))
+        .ok_or_else(|| Box::<dyn std::error::Error>::from("The room was not found on the server"))?;
+
+    // A missing or unreadable state file is treated like a first run.
+    let last_state : State = fs::read_to_string("state.toml")
+        .ok()
+        .and_then(|contents| toml::from_str(contents.as_str()).ok())
+        .unwrap_or_default();
+
+    let scrape_result = scrape_websites(&config).await;
+
+    // The state is only written after all messages were sent, so a failed
+    // announcement is repeated on the next run.
+    match (&last_state.scrape_result, &last_state.error, &scrape_result) {
+        (_, Some(old_error_message), Err(err)) => {
+            let new_error_message = err.to_string();
+            if new_error_message != *old_error_message {
+                post_message(&room, format!("Euer freundlicher Gemeinderat-Bot konnte die Gemeinderats-Seite nicht auslesen (der Fehler hat sich seit dem letzten Mal geändert): {}", new_error_message).as_str()).await?;
+            }
+        },
+        (_, _, Err(err)) => {
+            let new_error_message = err.to_string();
+            post_message(&room, format!("Euer freundlicher Gemeinderat-Bot konnte die Gemeinderats-Seite nicht auslesen: {}", new_error_message).as_str()).await?;
+        },
+        (Some(old_result), _, Ok(new_result)) => {
+            let messages = get_keyword_messages(&config, Some(old_result), new_result);
+            if !messages.is_empty() {
+                post_messages(&room, "Euer freundlicher Gemeinderat-Bot hat neue Gemeinderatssitzungen gefunden, in denen beobachtete Keywords in der Tagesordnung gefunden wurden:", &messages).await?;
+            } else if new_result.by_id.is_empty() && !old_result.by_id.is_empty() {
+                // Only claim that all sessions are gone when none is left.
+                post_message(&room, "Alle Gemeinderatssitzungen, in denen beobachtete Keywords in der Tagesordnung gefunden wurden, sind nicht mehr auf der Webseite.").await?;
+            }
+        },
+        (None, Some(_old_error), Ok(new_result)) => {
+            let messages = get_keyword_messages(&config, None, new_result);
+            if messages.is_empty() {
+                post_message(&room, "Euer freundlicher Gemeinderat-Bot funktioniert nach dem letzten Fehler wieder. Es sind aktuell keine Sitzungen mit beobachteten Keywords auf der Webseite.").await?;
+            } else {
+                post_messages(&room, "Euer freundlicher Gemeinderat-Bot funktioniert nach dem letzten Fehler wieder. Folgende Sitzungen haben beobachtete Keywords auf der Tagesordnung:", &messages).await?;
+            }
+        },
+        (None, None, Ok(new_result)) => {
+            let messages = get_keyword_messages(&config, None, new_result);
+            if messages.is_empty() {
+                post_message(&room, "Euer freundlicher Gemeinderat-Bot ist zum ersten Mal durchgelaufen. Es sind aktuell keine Sitzungen mit beobachteten Keywords auf der Webseite.").await?;
+            } else {
+                post_messages(&room, "Euer freundlicher Gemeinderat-Bot ist zum ersten Mal durchgelaufen. Folgende Sitzungen haben beobachtete Keywords auf der Tagesordnung:", &messages).await?;
+            }
+        },
+    };
+
+    let new_state = match scrape_result {
+        Ok(result) => State { scrape_result: Some(result), error: None },
+        Err(err) => State { scrape_result: None, error: Some(err.to_string()) },
+    };
+
+    if let Err(error) = toml::to_string(&new_state).map_err(box_error).and_then(|new_state| std::fs::File::create("state.toml").map_err(box_error)?.write_all(new_state.as_bytes()).map_err(box_error)) {
+        eprintln!("Could not save state: {}", error);
+    }
+
+    Ok(())
+}