Parses and searches Tag documents. (e.g. HTML, XML)
parsercher parses documents written in tags such as HTML and XML. - Create a Dom structure tree from the tag document. - Search for tags and text from the Dom structure tree. - Search subtrees from the Dom structure tree.
Add this to your Cargo.toml
:
[dependencies]
parsercher = "3.1.4"
MIT OR Apache-2.0
Example of getting text from HTML.
Create a tree of Dom structure from HTML and get the text of li
tag that value of class
attribute is target
.
```rust
use parsercher;
use parsercher::dom::Tag;
let html = r#"
if let Ok(rootdom) = parsercher::parse(&html) { let mut needle = Tag::new("li"); needle.setattr("class", "target");
if let Some(texts) = parsercher::search_text_from_tag_children(&root_dom, &needle) {
assert_eq!(texts.len(), 2);
assert_eq!(texts[0], "first".to_string());
assert_eq!(texts[1], "therd".to_string());
}
} ```
Example of searching a subtree from the Dom structure tree.
Find a subtree that has a ul
tag whose value in the class
attribute is targetList
and
two li
tags under it. Also, the values of the class
attribute of the li
tag must be
key1
andkey2
, respectively.
Looking for: ```text
```
```rust use parsercher;
let doc = r#"
<ul id="list2">
<li class="key1">2-1</li>
<li>2-2</li>
</ul>
<div>
<div>
<ul class="targetList">
<ul id="list3" class="targetList">
<li class="key1">3-1</li>
<li class="item">3-2</li>
<li class="key2">3-3</li>
</ul>
</ul>
</div>
</div>
<ul id="list4">
<li class="key1">4-1</li>
<li class="key2">4-2</li>
</ul>
"#;
let root_dom = parsercher::parse(&doc).unwrap();
let needle = r#"
"#;
let needledom = parsercher::parse(&needle).unwrap();
// Remove root
dom of needledom
let needledom = needledom.get_children().unwrap().get(0).unwrap();
if let Some(dom) = parsercher::searchdom(&rootdom, &needledom) {
parsercher::printdom_tree(&dom);
}
output:
text
More complex examples of Dom structure tree ```rust use parsercher;
let html = r#"
<div id="content"></div>
<ol>
<li>first</li>
<li>second</li>
<li>therd</li>
</ol>
<!-- All script code becomes one text -->
"#;
if let Ok(dom) = parsercher::parse(&html) { println!("{:#?}", dom); } ```
output:
Dom {
dom_type: Tag,
tag: Some(
Tag {
name: "root",
attr: None,
terminated: false,
terminator: false,
},
),
text: None,
comment: None,
children: Some(
[
Dom {
dom_type: Tag,
tag: Some(
Tag {
name: "!DOCTYPE",
attr: Some(
{
"html": "",
},
),
terminated: false,
terminator: false,
},
),
text: None,
comment: None,
children: None,
},
Dom {
dom_type: Tag,
tag: Some(
Tag {
name: "html",
attr: None,
terminated: false,
terminator: false,
},
),
text: None,
comment: None,
children: Some(
[
Dom {
dom_type: Tag,
tag: Some(
Tag {
name: "head",
attr: None,
terminated: false,
terminator: false,
},
),
text: None,
comment: None,
children: Some(
[
Dom {
dom_type: Tag,
tag: Some(
Tag {
name: "meta",
attr: Some(
{
"charset": "UTF-8",
},
),
terminated: false,
terminator: false,
},
),
text: None,
comment: None,
children: None,
},
Dom {
dom_type: Tag,
tag: Some(
Tag {
name: "title",
attr: None,
terminated: false,
terminator: false,
},
),
text: None,
comment: None,
children: Some(
[
Dom {
dom_type: Text,
tag: None,
text: Some(
Text {
text: "sample html",
},
),
comment: None,
children: None,
},
],
),
},
],
),
},
Dom {
dom_type: Tag,
tag: Some(
Tag {
name: "body",
attr: None,
terminated: false,
terminator: false,
},
),
text: None,
comment: None,
children: Some(
[
Dom {
dom_type: Tag,
tag: Some(
Tag {
name: "h1",
attr: None,
terminated: false,
terminator: false,
},
),
text: None,
comment: None,
children: Some(
[
Dom {
dom_type: Text,
tag: None,
text: Some(
Text {
text: "Hello, world!",
},
),
comment: None,
children: None,
},
],
),
},
Dom {
dom_type: Tag,
tag: Some(
Tag {
name: "div",
attr: Some(
{
"id": "content",
},
),
terminated: false,
terminator: false,
},
),
text: None,
comment: None,
children: None,
},
Dom {
dom_type: Tag,
tag: Some(
Tag {
name: "ol",
attr: None,
terminated: false,
terminator: false,
},
),
text: None,
comment: None,
children: Some(
[
Dom {
dom_type: Tag,
tag: Some(
Tag {
name: "li",
attr: None,
terminated: false,
terminator: false,
},
),
text: None,
comment: None,
children: Some(
[
Dom {
dom_type: Text,
tag: None,
text: Some(
Text {
text: "first",
},
),
comment: None,
children: None,
},
],
),
},
Dom {
dom_type: Tag,
tag: Some(
Tag {
name: "li",
attr: None,
terminated: false,
terminator: false,
},
),
text: None,
comment: None,
children: Some(
[
Dom {
dom_type: Text,
tag: None,
text: Some(
Text {
text: "second",
},
),
comment: None,
children: None,
},
],
),
},
Dom {
dom_type: Tag,
tag: Some(
Tag {
name: "li",
attr: None,
terminated: false,
terminator: false,
},
),
text: None,
comment: None,
children: Some(
[
Dom {
dom_type: Text,
tag: None,
text: Some(
Text {
text: "therd",
},
),
comment: None,
children: None,
},
],
),
},
],
),
},
Dom {
dom_type: Comment,
tag: None,
text: None,
comment: Some(
Comment {
comment: " All script code becomes one text ",
},
),
children: None,
},
Dom {
dom_type: Tag,
tag: Some(
Tag {
name: "script",
attr: None,
terminated: false,
terminator: false,
},
),
text: None,
comment: None,
children: Some(
[
Dom {
dom_type: Text,
tag: None,
text: Some(
Text {
text: "\n let content = document.getElementById(\'content\');\n content.textContent = \'content\';\n",
},
),
comment: None,
children: None,
},
],
),
},
],
),
},
],
),
},
],
),
}