# Frangipani

The goal of this project is to create a configurable and extensible crawler framework.

## Features

## Usage

```rust
use async_trait::async_trait;
use frangipani::{Response, Spider};
use frangipani::util::join_url;
use scraper::{Html, Selector};

/// Example spider that crawls `dexcode.com`; it carries no state of its own.
pub struct DexcodeSpider {}

[async_trait]

impl Spider for DexcodeSpider { fn name(&self) -> String { "dexcode-spider".to_owned() }

fn start_urls(&self) -> Vec<String> {
    vec![
        "https://dexcode.com/".to_owned(),
    ]
}

async fn parse(&self, response: Response) -> (u64, Vec<String>) {
    if response.content_type() != "text/html" {
        return (0, vec![]);
    }

    let url = response.get_url().to_owned();
    let text = response.into_string().unwrap();

    let mut urls = vec![];
    {
        let document = Html::parse_document(&text);
        let link_selector = Selector::parse("a").unwrap();
        for link in document.select(&link_selector) {
            if let Some(relative_url) = link.value().attr("href") {
                let join_url = join_url(&url, relative_url);
                let req_url = reqwest::Url::parse(&join_url).unwrap();
                if req_url.scheme() != "http" && req_url.scheme() != "https" {
                    continue;
                }
                if req_url.domain().unwrap().ends_with("dexcode.com") {
                    // Only push url with `dexcode.com` domain
                    urls.push(req_url.to_string());
                }
            }
        }

        let title_selector = Selector::parse("title").unwrap();
        let title = match document.select(&title_selector).next() {
            Some(el) => el.inner_html(),
            None => "".to_owned(),
        };
        println!("{},{}", url, title);
    }

    (1, urls)
}

}

[tokio::main]

async fn main() { env_logger::init();

let spiders: Vec<Box<dyn Spider + Send + Sync>> = vec![
    Box::new(DexcodeSpider {}),
];

let mut engine = frangipani::engine(spiders);
engine.start().await;

} ```

For continuous crawling, see `examples/continuous.rs` in the project repository.