
# Crusty-core - build your own web crawler!

## Example - crawl a single website, collect information about TITLE tags

```rust
use crusty_core::prelude::*;

#[derive(Debug, Clone, Default)]
pub struct JobState {
    sum_title_len: usize,
}

#[derive(Debug, Clone, Default)]
pub struct TaskState {
    title: String,
}

pub struct DataExtractor {}
type Ctx = JobCtx<JobState, TaskState>;
impl TaskExpander<JobState, TaskState, Document> for DataExtractor {
    fn expand(
        &self, ctx: &mut Ctx, _: &Task, _: &HttpStatus, doc: &Document,
    ) -> task_expanders::ExtResult {
        // Extract the TITLE tag: accumulate its length in the shared job state
        // and stash the title itself in this task's own state
        if let Some(title) = doc.find(Name("title")).next().map(|v| v.text()) {
            ctx.job_state.lock().unwrap().sum_title_len += title.len();
            ctx.task_state.title = title;
        }
        Ok(())
    }
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let crawler = Crawler::new_default()?;

    let settings = config::CrawlingSettings::default();
    let rules = CrawlingRules::default().with_task_expander(|| DataExtractor {});

    let job = Job::new("https://example.com", settings, rules, JobState::default())?;
    for r in crawler.iter(job) {
        println!("- {}, task state: {:?}", r, r.ctx.task_state);
        if let JobStatus::Finished(_) = r.status {
            println!("final job state: {:?}", r.ctx.job_state.lock().unwrap());
        }
    }
    Ok(())
}
```
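A note on the two state types above: `JobState` is shared by every task in the job, which is why it sits behind a mutex, while `TaskState` is owned by a single task. The snippet below is a plain-Rust illustration of that sharing pattern (standard library only, no crusty-core types):

```rust
use std::sync::{Arc, Mutex};
use std::thread;

fn main() {
    // Job-level state: one counter shared by all workers, guarded by a mutex
    let sum_title_len = Arc::new(Mutex::new(0usize));

    let handles: Vec<_> = (0..4)
        .map(|i| {
            let shared = Arc::clone(&sum_title_len);
            thread::spawn(move || {
                // Task-level state: owned by this worker alone, no locking needed
                let title = format!("page-{i}");
                *shared.lock().unwrap() += title.len();
            })
        })
        .collect();

    for h in handles {
        h.join().unwrap();
    }
    println!("sum of title lengths: {}", *sum_title_len.lock().unwrap());
}
```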

If you want to get fancier, configure additional settings, or control your imports more precisely, see the examples in the repository.


## Install

Simply add this to your Cargo.toml (the `~` requirement allows patch updates within 0.14.x):

```toml
[dependencies]
crusty-core = "~0.14.0"
```

## Key capabilities

## Notes

Please see the examples for more complicated usage scenarios. This crawler is more verbose than some others, but it allows for incredible customization at each and every step.
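For instance, nothing limits a job to a single task expander. The sketch below registers a second, hypothetical `LinkCounter` expander alongside `DataExtractor` from the example above; it assumes `JobState` has been extended with a `sum_links` field and that `with_task_expander` returns the rules object so calls can be chained:

```rust
// Hypothetical second expander that counts <a> tags per page.
// Reuses Ctx, JobState, TaskState and Document from the example above;
// JobState is assumed to have gained a `sum_links: usize` field.
pub struct LinkCounter {}
impl TaskExpander<JobState, TaskState, Document> for LinkCounter {
    fn expand(
        &self, ctx: &mut Ctx, _: &Task, _: &HttpStatus, doc: &Document,
    ) -> task_expanders::ExtResult {
        let links = doc.find(Name("a")).count();
        ctx.job_state.lock().unwrap().sum_links += links;
        Ok(())
    }
}

// Registration, assuming builder-style chaining:
// let rules = CrawlingRules::default()
//     .with_task_expander(|| DataExtractor {})
//     .with_task_expander(|| LinkCounter {});
```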

If you are interested in the area of broad web crawling, there's crusty, developed fully on top of crusty-core, which tries to tackle some of the challenges of broad web crawling.