Small library to fetch info about a web page: title, description, language, HTTP info, RSS feeds, Opengraph, Schema.org, and more
```rust
use webpage::{Webpage, WebpageOptions};

let info = Webpage::from_url("http://www.rust-lang.org/en-US/", WebpageOptions::default())
    .expect("Could not read from URL");

// the HTTP transfer info
let http = info.http;

assert_eq!(http.ip, "54.192.129.71".to_string());
assert!(http.headers[0].starts_with("HTTP"));
assert!(http.body.starts_with("<!DOCTYPE html>"));
assert_eq!(http.url, "https://www.rust-lang.org/en-US/".to_string()); // followed redirects (HTTPS)
assert_eq!(http.content_type, "text/html".to_string());

// the parsed HTML info
let html = info.html;

assert_eq!(html.title, Some("The Rust Programming Language".to_string()));
assert_eq!(html.description, Some("A systems programming language that runs blazingly fast, prevents segfaults, and guarantees thread safety.".to_string()));
assert_eq!(html.opengraph.og_type, "website".to_string());
```
You can also get HTML info about local data:
```rust
use webpage::HTML;

let html = HTML::from_file("index.html", None);
// or let html = HTML::from_string(input, None);
```
If you need to be able to serialize the data provided by the library using serde, you can specify the `serde` feature while declaring your dependencies in `Cargo.toml`:
```toml
webpage = { version = "1.1", features = ["serde"] }
```
The `curl` feature is enabled by default but is optional. Disabling it is useful if you do not need an HTTP client but already have the HTML data at hand.
```rust
pub struct Webpage {
    pub http: HTTP, // info about the HTTP transfer
    pub html: HTML, // info from the parsed HTML doc
}

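pub struct HTTP {
    pub ip: String,
    pub transfer_time: Duration,
    pub redirect_count: u32,
    pub content_type: String,
    pub response_code: u32,
    pub headers: Vec<String>, // raw response headers
    pub url: String, // effective url (after redirects)
    pub body: String,
}
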
pub struct HTML {
    pub title: Option<String>,
    pub description: Option<String>,
    pub url: Option<String>, // canonical url
    pub feed: Option<String>, // RSS feed typically
    pub language: Option<String>, // as specified, not detected
    pub text_content: String, // all tags stripped from body
    pub meta: HashMap<String, String>, // flattened down list of meta properties
    pub opengraph: Opengraph,
    pub schema_org: Vec<SchemaOrg>,
}

pub struct Opengraph {
    pub og_type: String,
    pub properties: HashMap<String, String>,
    pub images: Vec<Object>,
    pub videos: Vec<Object>,
    pub audios: Vec<Object>,
}

// Facebook's Opengraph structured data
pub struct OpengraphObject {
    pub url: String,
    pub properties: HashMap<String, String>,
}

// Google's schema.org structured data
pub struct SchemaOrg {
    pub schema_type: String,
    pub value: serde_json::Value,
}
```
The following configurations are available (shown with their default values):

```rust
pub struct WebpageOptions {
    allow_insecure: false,
    follow_location: true,
    max_redirections: 5,
    timeout: Duration::from_secs(10),
    useragent: "Webpage - Rust crate - https://crates.io/crates/webpage".to_string(),
}

// usage
let options = WebpageOptions { allow_insecure: true, ..Default::default() };
let info = Webpage::from_url(&url, options).expect("Halp, could not fetch");
```