parsercher

Crate API

Parses and searches tag documents (e.g. HTML, XML).

parsercher parses documents written with tags, such as HTML and XML.

- Create a tree of Dom structures from the tag document.
- Search for tags and text in the tree of Dom structures.

Usage

Add this to your `Cargo.toml`:

```toml
[dependencies]
parsercher = "1.0.0"
```

License

MIT OR Apache-2.0

Examples

Example of getting text from HTML.
Create a tree of Dom structures from HTML and get the text of the `li` tags whose `class` attribute value is `target`.

```rust
use std::collections::HashMap;
use parsercher;
use parsercher::dom::Tag;

let html = r#"
<!DOCTYPE html>
<html>
  <head>
    <meta charset="UTF-8">
    <title>sample html</title>
  </head>
  <body>
    <ol>
      <li class="target">first</li>
      <li>second</li>
      <li class="target">therd</li>
    </ol>
  </body>
</html>
"#;

if let Ok(root_dom) = parsercher::parse(&html) {
    let mut needle = Tag::new("li".to_string());
    let mut attr = HashMap::new();
    attr.insert("class".to_string(), "target".to_string());
    needle.set_attr(attr);

    if let Some(texts) = parsercher::search_text_from_tag_children(&root_dom, &needle) {
        assert_eq!(texts.len(), 2);
        assert_eq!(texts[0], "first".to_string());
        assert_eq!(texts[1], "therd".to_string());
    }
}
```

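A more complex example of a Dom structure tree.

```rust
use parsercher;

let html = r#"
<!DOCTYPE html>
<html>
  <head>
    <meta charset="UTF-8">
    <title>sample html</title>
  </head>
  <body>
    <h1>Hello, world!</h1>

    <div id="content"></div>

    <ol>
      <li>first</li>
      <li>second</li>
      <li>therd</li>
    </ol>
    <!-- All script code becomes one text -->
<script>
  let content = document.getElementById('content');
  content.textContent = 'content';
</script>
  </body>
</html>
"#;

if let Ok(dom) = parsercher::parse(&html) {
    println!("{:#?}", dom);
}
```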

output: Dom { dom_type: Tag, tag: Some( Tag { name: "root", attr: None, terminated: false, terminator: false, }, ), text: None, comment: None, children: Some( [ Dom { dom_type: Tag, tag: Some( Tag { name: "!DOCTYPE", attr: Some( { "html": "", }, ), terminated: false, terminator: false, }, ), text: None, comment: None, children: None, }, Dom { dom_type: Tag, tag: Some( Tag { name: "html", attr: None, terminated: false, terminator: false, }, ), text: None, comment: None, children: Some( [ Dom { dom_type: Tag, tag: Some( Tag { name: "head", attr: None, terminated: false, terminator: false, }, ), text: None, comment: None, children: Some( [ Dom { dom_type: Tag, tag: Some( Tag { name: "meta", attr: Some( { "charset": "UTF-8", }, ), terminated: false, terminator: false, }, ), text: None, comment: None, children: None, }, Dom { dom_type: Tag, tag: Some( Tag { name: "title", attr: None, terminated: false, terminator: false, }, ), text: None, comment: None, children: Some( [ Dom { dom_type: Text, tag: None, text: Some( Text { text: "sample html", }, ), comment: None, children: None, }, ], ), }, ], ), }, Dom { dom_type: Tag, tag: Some( Tag { name: "body", attr: None, terminated: false, terminator: false, }, ), text: None, comment: None, children: Some( [ Dom { dom_type: Tag, tag: Some( Tag { name: "h1", attr: None, terminated: false, terminator: false, }, ), text: None, comment: None, children: Some( [ Dom { dom_type: Text, tag: None, text: Some( Text { text: "Hello, world!", }, ), comment: None, children: None, }, ], ), }, Dom { dom_type: Tag, tag: Some( Tag { name: "div", attr: Some( { "id": "content", }, ), terminated: false, terminator: false, }, ), text: None, comment: None, children: None, }, Dom { dom_type: Tag, tag: Some( Tag { name: "ol", attr: None, terminated: false, terminator: false, }, ), text: None, comment: None, children: Some( [ Dom { dom_type: Tag, tag: Some( Tag { name: "li", attr: None, terminated: false, terminator: false, }, ), text: None, comment: 
None, children: Some( [ Dom { dom_type: Text, tag: None, text: Some( Text { text: "first", }, ), comment: None, children: None, }, ], ), }, Dom { dom_type: Tag, tag: Some( Tag { name: "li", attr: None, terminated: false, terminator: false, }, ), text: None, comment: None, children: Some( [ Dom { dom_type: Text, tag: None, text: Some( Text { text: "second", }, ), comment: None, children: None, }, ], ), }, Dom { dom_type: Tag, tag: Some( Tag { name: "li", attr: None, terminated: false, terminator: false, }, ), text: None, comment: None, children: Some( [ Dom { dom_type: Text, tag: None, text: Some( Text { text: "therd", }, ), comment: None, children: None, }, ], ), }, ], ), }, Dom { dom_type: Comment, tag: None, text: None, comment: Some( Comment { comment: " All script code becomes one text ", }, ), children: None, }, Dom { dom_type: Tag, tag: Some( Tag { name: "script", attr: None, terminated: false, terminator: false, }, ), text: None, comment: None, children: Some( [ Dom { dom_type: Text, tag: None, text: Some( Text { text: "\n let content = document.getElementById(\'content\');\n content.textContent = \'content\';\n", }, ), comment: None, children: None, }, ], ), }, ], ), }, ], ), }, ], ), }