From 8f26302044b3175a3d00ab332aea21b7a07689e1 Mon Sep 17 00:00:00 2001 From: Folyd Date: Sat, 22 Feb 2020 15:44:35 +0800 Subject: [PATCH] Parse book chapters recursively --- rust/src/bin/books-index.rs | 77 ++++++++++++++++++++++++++++--------- 1 file changed, 59 insertions(+), 18 deletions(-) diff --git a/rust/src/bin/books-index.rs b/rust/src/bin/books-index.rs index a9da152..269fbc1 100644 --- a/rust/src/bin/books-index.rs +++ b/rust/src/bin/books-index.rs @@ -1,39 +1,72 @@ use std::fs; use std::path::Path; -use futures::future::join_all; +use futures::future::try_join_all; use reqwest; use select::document::Document; -use select::predicate::{Class, Name, Predicate}; +use select::node::Node; +use select::predicate::Class; use serde_derive::{Deserialize, Serialize}; use serde_json; use tokio; const BOOKS_INDEX_PATH: &str = "../extension/index/books.js"; -// [title, path] -type Chapter = [String; 2]; +#[derive(Serialize, Debug)] +struct Page { + title: String, + path: String, + parent_titles: Option>, +} #[derive(Serialize, Deserialize, Debug)] struct Book { name: String, url: String, #[serde(skip_deserializing)] - chapters: Vec, + pages: Vec, +} + +fn parse_page(node: &Node) -> Page { + let a = node.first_child().unwrap(); + let title = a.text(); + let path = a + .attr("href") + .unwrap() + .trim_end_matches(".html") + .to_string(); + Page { + title, + path, + parent_titles: None, + } +} + +fn parse_node(node: &Node, parent_titles: Option>) -> Vec { + let mut pages = vec![]; + for child in node.children() { + if child.is(Class("expanded")) { + let mut page = parse_page(&child); + page.parent_titles = parent_titles.clone(); + pages.push(page); + } else { + let mut new_parent_titles = parent_titles.clone().unwrap_or(vec![]); + if let Some(page) = child.prev().map(|n| parse_page(&n)) { + new_parent_titles.push(page.title); + if let Some(section) = child.find(Class("section")).next() { + pages.extend(parse_node(§ion, Some(new_parent_titles))) + } + } + } + } + pages } async fn fetch_book(mut book: Book) -> Result> { let html = reqwest::get(&book.url).await?.text().await?; let doc = Document::from(html.as_str()); - for node in doc.find(Class("chapter").descendant(Name("a"))) { - let title = node.text(); - let path = node - .attr("href") - .unwrap() - .trim_end_matches(".html") - .to_string(); - book.chapters.push([title, path]); - } + let node = doc.find(Class("chapter")).next().unwrap(); + book.pages = parse_node(&node, None); Ok(book) } @@ -43,9 +76,17 @@ async fn main() -> Result<(), Box> { .into_iter() .map(|book| fetch_book(book)) .collect(); - let books: Vec = join_all(futures).await.into_iter().flatten().collect(); - let contents = format!("var booksIndex={};", serde_json::to_string(&books)?); - let path = Path::new(BOOKS_INDEX_PATH); - fs::write(path, &contents)?; + match try_join_all(futures).await { + Ok(result) => { + let books: Vec<_> = result.into_iter().collect(); + let contents = format!("var booksIndex={};", serde_json::to_string(&books)?); + let path = Path::new(BOOKS_INDEX_PATH); + fs::write(path, &contents)?; + } + Err(error) => { + println!("{:?}", error); + } + } + Ok(()) }