Parse book chapters recursively

This commit is contained in:
Folyd 2020-02-22 15:44:35 +08:00
parent e0a35b8c47
commit 8f26302044

View file

@ -1,39 +1,72 @@
use std::fs;
use std::path::Path;
use futures::future::join_all;
use futures::future::try_join_all;
use reqwest;
use select::document::Document;
use select::predicate::{Class, Name, Predicate};
use select::node::Node;
use select::predicate::Class;
use serde_derive::{Deserialize, Serialize};
use serde_json;
use tokio;
const BOOKS_INDEX_PATH: &str = "../extension/index/books.js";
// [title, path]
type Chapter = [String; 2];
#[derive(Serialize, Debug)]
struct Page {
title: String,
path: String,
parent_titles: Option<Vec<String>>,
}
#[derive(Serialize, Deserialize, Debug)]
struct Book {
name: String,
url: String,
#[serde(skip_deserializing)]
chapters: Vec<Chapter>,
pages: Vec<Page>,
}
fn parse_page(node: &Node) -> Page {
let a = node.first_child().unwrap();
let title = a.text();
let path = a
.attr("href")
.unwrap()
.trim_end_matches(".html")
.to_string();
Page {
title,
path,
parent_titles: None,
}
}
fn parse_node(node: &Node, parent_titles: Option<Vec<String>>) -> Vec<Page> {
let mut pages = vec![];
for child in node.children() {
if child.is(Class("expanded")) {
let mut page = parse_page(&child);
page.parent_titles = parent_titles.clone();
pages.push(page);
} else {
let mut new_parent_titles = parent_titles.clone().unwrap_or(vec![]);
if let Some(page) = child.prev().map(|n| parse_page(&n)) {
new_parent_titles.push(page.title);
if let Some(section) = child.find(Class("section")).next() {
pages.extend(parse_node(&section, Some(new_parent_titles)))
}
}
}
}
pages
}
async fn fetch_book(mut book: Book) -> Result<Book, Box<dyn std::error::Error>> {
let html = reqwest::get(&book.url).await?.text().await?;
let doc = Document::from(html.as_str());
for node in doc.find(Class("chapter").descendant(Name("a"))) {
let title = node.text();
let path = node
.attr("href")
.unwrap()
.trim_end_matches(".html")
.to_string();
book.chapters.push([title, path]);
}
let node = doc.find(Class("chapter")).next().unwrap();
book.pages = parse_node(&node, None);
Ok(book)
}
@ -43,9 +76,17 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
.into_iter()
.map(|book| fetch_book(book))
.collect();
let books: Vec<Book> = join_all(futures).await.into_iter().flatten().collect();
let contents = format!("var booksIndex={};", serde_json::to_string(&books)?);
let path = Path::new(BOOKS_INDEX_PATH);
fs::write(path, &contents)?;
match try_join_all(futures).await {
Ok(result) => {
let books: Vec<_> = result.into_iter().collect();
let contents = format!("var booksIndex={};", serde_json::to_string(&books)?);
let path = Path::new(BOOKS_INDEX_PATH);
fs::write(path, &contents)?;
}
Err(error) => {
println!("{:?}", error);
}
}
Ok(())
}