mirror of
https://github.com/huhu/rust-search-extension
synced 2024-11-14 23:57:07 +00:00
Parse book chapters recursively
This commit is contained in:
parent
e0a35b8c47
commit
8f26302044
1 changed files with 59 additions and 18 deletions
|
@ -1,39 +1,72 @@
|
|||
use std::fs;
|
||||
use std::path::Path;
|
||||
|
||||
use futures::future::join_all;
|
||||
use futures::future::try_join_all;
|
||||
use reqwest;
|
||||
use select::document::Document;
|
||||
use select::predicate::{Class, Name, Predicate};
|
||||
use select::node::Node;
|
||||
use select::predicate::Class;
|
||||
use serde_derive::{Deserialize, Serialize};
|
||||
use serde_json;
|
||||
use tokio;
|
||||
|
||||
const BOOKS_INDEX_PATH: &str = "../extension/index/books.js";
|
||||
|
||||
// [title, path]
|
||||
type Chapter = [String; 2];
|
||||
#[derive(Serialize, Debug)]
|
||||
struct Page {
|
||||
title: String,
|
||||
path: String,
|
||||
parent_titles: Option<Vec<String>>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
struct Book {
|
||||
name: String,
|
||||
url: String,
|
||||
#[serde(skip_deserializing)]
|
||||
chapters: Vec<Chapter>,
|
||||
pages: Vec<Page>,
|
||||
}
|
||||
|
||||
fn parse_page(node: &Node) -> Page {
|
||||
let a = node.first_child().unwrap();
|
||||
let title = a.text();
|
||||
let path = a
|
||||
.attr("href")
|
||||
.unwrap()
|
||||
.trim_end_matches(".html")
|
||||
.to_string();
|
||||
Page {
|
||||
title,
|
||||
path,
|
||||
parent_titles: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_node(node: &Node, parent_titles: Option<Vec<String>>) -> Vec<Page> {
|
||||
let mut pages = vec![];
|
||||
for child in node.children() {
|
||||
if child.is(Class("expanded")) {
|
||||
let mut page = parse_page(&child);
|
||||
page.parent_titles = parent_titles.clone();
|
||||
pages.push(page);
|
||||
} else {
|
||||
let mut new_parent_titles = parent_titles.clone().unwrap_or(vec![]);
|
||||
if let Some(page) = child.prev().map(|n| parse_page(&n)) {
|
||||
new_parent_titles.push(page.title);
|
||||
if let Some(section) = child.find(Class("section")).next() {
|
||||
pages.extend(parse_node(§ion, Some(new_parent_titles)))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
pages
|
||||
}
|
||||
|
||||
async fn fetch_book(mut book: Book) -> Result<Book, Box<dyn std::error::Error>> {
|
||||
let html = reqwest::get(&book.url).await?.text().await?;
|
||||
let doc = Document::from(html.as_str());
|
||||
for node in doc.find(Class("chapter").descendant(Name("a"))) {
|
||||
let title = node.text();
|
||||
let path = node
|
||||
.attr("href")
|
||||
.unwrap()
|
||||
.trim_end_matches(".html")
|
||||
.to_string();
|
||||
book.chapters.push([title, path]);
|
||||
}
|
||||
let node = doc.find(Class("chapter")).next().unwrap();
|
||||
book.pages = parse_node(&node, None);
|
||||
Ok(book)
|
||||
}
|
||||
|
||||
|
@ -43,9 +76,17 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|||
.into_iter()
|
||||
.map(|book| fetch_book(book))
|
||||
.collect();
|
||||
let books: Vec<Book> = join_all(futures).await.into_iter().flatten().collect();
|
||||
let contents = format!("var booksIndex={};", serde_json::to_string(&books)?);
|
||||
let path = Path::new(BOOKS_INDEX_PATH);
|
||||
fs::write(path, &contents)?;
|
||||
match try_join_all(futures).await {
|
||||
Ok(result) => {
|
||||
let books: Vec<_> = result.into_iter().collect();
|
||||
let contents = format!("var booksIndex={};", serde_json::to_string(&books)?);
|
||||
let path = Path::new(BOOKS_INDEX_PATH);
|
||||
fs::write(path, &contents)?;
|
||||
}
|
||||
Err(error) => {
|
||||
println!("{:?}", error);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue