close
The Wayback Machine - https://web.archive.org/web/20200910035235/https://github.com/Hexilee/unhtml.rs
Skip to content
master
Go to file
Code

Latest commit

 

Git stats

Files

Permalink
Failed to load latest commit information.
Type
Name
Latest commit message
Commit time
src
 
 
 
 
 
 
 
 
 
 
 
 
 
 

README.md

HTML deserializer for rust

Build status License: MIT

unhtml

Crate version Rust Docs

unhtml_derive

Crate version Rust Docs


Table of Contents

Derive Target

struct

Basic Usage

#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml};

#[derive(FromHtml)]
#[html(selector = "#test")]
struct SingleUser {
    #[html(selector = "p:nth-child(1)", attr = "inner")]
    name: String,

    #[html(selector = "p:nth-child(2)", attr = "inner")]
    age: u8,

    #[html(selector = "p:nth-child(3)", attr = "inner")]
    like_lemon: bool,
}

let user = SingleUser::from_html(r#"<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
    <div id="test">
        <div>
            <p>Hexilee</p>
            <p>20</p>
            <p>true</p>
        </div>
    </div>
</body>
</html>"#).unwrap();
assert_eq!("Hexilee", &user.name);
assert_eq!(20, user.age);
assert!(user.like_lemon);

Attributes

html

target

derive target or field

specification

#[html(selector = "...", attr = "...", default = ...)]

selector, attr, default or html itself can be unnecessary.

This is valid

#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml};


#[derive(FromHtml)]
struct SingleString {
    value: String,
}

selector

target

derive target or field

literal type

string

specification

selector must be a valid css-selector, invalid selector will cause a compile-time panic

// panic
#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml};

#[derive(FromHtml)]
#[html(selector = "<>")]
struct SingleUser {}
// panic
#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::*;

#[derive(FromHtml)]
struct SingleUser {
    #[html(selector = "<>", attr = "inner")]
    name: String,
}

if multi element is selected and field type is not Vec, the first will be chosen

#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml};

#[derive(FromHtml)]
#[html(selector = "a")]
struct Link {
    #[html(attr = "href")]
    href: String,

    #[html(attr = "inner")]
    value: String,
}

let link = Link::from_html(r#"
<a href="https://github.com">Github</a> 
<a href="https://google.com">Google</a> 
"#).unwrap();
assert_eq!("https://github.com", &link.href);
assert_eq!("Github", &link.value);
default behavior

html of its root element

#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml};

#[derive(FromHtml)]
struct Link {
    #[html(attr = "href")]
    href: String,

    #[html(attr = "inner")]
    value: String,
}

let link = Link::from_html(r#"<a href="https://github.com">Github</a>"#).unwrap();
assert_eq!("https://github.com", &link.href);
assert_eq!("Github", &link.value);

attr

target

field

literal type

string

specification
  • inner refer to innerHtml
  • any other attr refer to html element attribute
#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml};

#[derive(FromHtml)]
struct Link {
    #[html(attr = "href")]
    href: String,

    #[html(attr = "inner")]
    value: String,
}

let link = Link::from_html(r#"<a href="https://github.com">Github</a>"#).unwrap();
assert_eq!("https://github.com", &link.href);
assert_eq!("Github", &link.value);
default behavior

html of the whole element (not innerHtml!)

#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml};

#[derive(FromHtml)]
struct Link {
    #[html(attr = "href")]
    href: String,

    #[html(attr = "inner")]
    value: String,
    
    source: String,
}

let link = Link::from_html(r#"<a href="https://github.com">Github</a>"#).unwrap();
assert_eq!("https://github.com", &link.href);
assert_eq!("Github", &link.value);
assert_eq!(r#"<a href="https://github.com">Github</a>"#, &link.source);

default

target

field

literal type

any literal type

specification
  • the same type with field
#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml};

#[derive(FromHtml)]
struct DefaultUser {
    // invoke String::from_html
    #[html(selector = "#non-exist", default = "Hexilee")]
    name: String,

    // invoke u8::from<u8>
    #[html(default = 20)]
    age: u8,

    #[html(default = true)]
    like_lemon: bool,
}

let user = DefaultUser::from_html("<p></p>").unwrap();
assert_eq!("Hexilee", &user.name);
assert_eq!(20, user.age);
assert_eq!(-1000, user.assets);
assert!(user.like_lemon);
  • string
#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml};

#[derive(FromHtml)]
struct Link {
    #[html(attr = "href")]
    href: String,

    #[html(attr = "inner")]
    value: String,
}

#[derive(FromHtml)]
struct Website {
    #[html(default = "10")]
    age: u8,

    #[html(default = "<a href='https://github.com'>Github</a>")]
    link: Link,
}

let website = Website::from_html("<p></p>").unwrap();
let link = website.link;
assert_eq!(10u8, website.age);
assert_eq!("https://github.com", &link.href);
assert_eq!("Github", &link.value);
default behavior

return a Err(unhtml::failure::Error) when selected nothing

// panic
#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml};

#[derive(FromHtml)]
struct Link {
    // no default
    #[html(attr = "href")]
    href: String,

    #[html(attr = "inner")]
    value: String,
}

let link = Link::from_html(r#"<a>Github</a>"#).unwrap();

Field Type

any type implemented FromHtml, without generics
// panic
#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml};

#[derive(FromHtml)]
struct Link {
    // no default
    #[html(attr = "href")]
    href: &str,
}
// panic
#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml};

#[derive(FromHtml)]
struct Website {
    // no default
    #[html(attr = "href")]
    hrefs: std::collections::LinkedList<String>,
}
// panic
#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml};

#[derive(FromHtml)]
struct Website {
    // no default
    #[html(attr = "href")]
    hrefs: [String],
}
Vec

Should use unhtml::VecFromHtml

extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml, VecFromHtml};

#[derive(FromHtml)]
struct TestUser {
    #[html(selector = "p:nth-child(1)", attr = "inner")]
    name: String,

    #[html(selector = "p:nth-child(2)", attr = "inner")]
    age: u8,

    #[html(selector = "p:nth-child(3)", attr = "inner")]
    like_lemon: bool,
}

#[derive(FromHtml)]
#[html(selector = "#test")]
struct TestUsers {
    #[html(selector = "div")]
    users: Vec<TestUser>,
}

let users = TestUsers::from_html(r#"<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
    <div id="test">
        <div>
            <p>Hexilee</p>
            <p>20</p>
            <p>true</p>
        </div>
        <div>
            <p>BigBrother</p>
            <p>21</p>
            <p>false</p>
        </div>
    </div>
</body>
</html>"#).unwrap();
let hexilee = &users.users[0];
let big_brother = &users.users[1];
assert_eq!("Hexilee", &hexilee.name);
assert_eq!(20, hexilee.age);
assert!(hexilee.like_lemon);
assert_eq!("BigBrother", &big_brother.name);
assert_eq!(21, big_brother.age);
assert!(!big_brother.like_lemon);

as the documentation of crate unhtml, if you want Vec<TestUser> straightly, you can just:

extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, VecFromHtml};

#[derive(FromHtml)]
struct TestUser {
    #[html(selector = "p:nth-child(1)", attr = "inner")]
    name: String,

    #[html(selector = "p:nth-child(2)", attr = "inner")]
    age: u8,

    #[html(selector = "p:nth-child(3)", attr = "inner")]
    like_lemon: bool,
}

let users = Vec::<TestUser>::from_html("#test > div", r#"<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
    <div id="test">
        <div>
            <p>Hexilee</p>
            <p>20</p>
            <p>true</p>
        </div>
        <div>
            <p>BigBrother</p>
            <p>21</p>
            <p>false</p>
        </div>
    </div>
</body>
</html>"#).unwrap();
let hexilee = &users[0];
let big_brother = &users[1];
assert_eq!("Hexilee", &hexilee.name);
assert_eq!(20, hexilee.age);
assert!(hexilee.like_lemon);
assert_eq!("BigBrother", &big_brother.name);
assert_eq!(21, big_brother.age);
assert!(!big_brother.like_lemon);

Source HTML

with top selector

all source html will be parsed as fragment. The top element is html and there is no DOCTYPE, head or body.

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
    <div id="test">
        <div>
            <p>Hexilee</p>
            <p>20</p>
            <p>true</p>
        </div>
    </div>
</body>
</html>

will be parsed as:

<html lang="en">
    <meta charset="UTF-8">
    <title>Title</title>
    <div id="test">
        <div>
            <p>Hexilee</p>
            <p>20</p>
            <p>true</p>
        </div>
    </div>
</html>

and

<p>Hexilee</p>

will be parsed as:

<html>
    <p>Hexilee</p>
</html>    
// panic

extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml};

#[derive(FromHtml)]
struct Document {
    // no default
    #[html(selector = "head")]
    head: String,

    #[html(selector = "body")]
    body: String,
}

let dicument = Document::from_html(r#"<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
    <div id="test">
        <div>
            <p>Hexilee</p>
            <p>20</p>
            <p>true</p>
        </div>
    </div>
</body>
</html>"#).unwrap();
without top selector

when derived struct doesn't have top selector, all source html will be parsed as pure fragment. There is no DOCTYPE, html, head or body.

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
    <div id="test">
        <div>
            <p>Hexilee</p>
            <p>20</p>
            <p>true</p>
        </div>
    </div>
</body>
</html>

will be parsed as:

<meta charset="UTF-8">
<title>Title</title>
<div id="test">
   <div>
       <p>Hexilee</p>
       <p>20</p>
       <p>true</p>
   </div>
</div>

and

<p>Hexilee</p>

will be parsed as:

<p>Hexilee</p>
#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml};

#[derive(FromHtml)]
struct Link {
    #[html(attr = "href")]
    href: String,

    #[html(attr = "inner")]
    value: String,
}

let link = Link::from_html(r#"<a href="https://github.com">Github</a>"#).unwrap();
assert_eq!("https://github.com", &link.href);
assert_eq!("Github", &link.value);

About

A magic html parser

Topics

Resources

License

Packages

No packages published

Languages

You can’t perform that action at this time.