xml2: Parse XML
R上でXMLやHTMLをより簡単に解析する
- CRAN: http://cran.r-project.org/web/packages/xml2/index.html
- Vignettes:
- GitHub: https://github.com/hadley/xml2
> library(xml2)
バージョン: 1.0.0
関数名 | 概略 |
---|---|
as_list | Coerce xml nodes to a list. |
read_xml | Read HTML or XML. |
url_absolute | Convert between relative and absolute urls. |
url_escape | Escape and unescape urls. |
url_parse | Parse a url into its component pieces. |
write_xml | Write XML to disk. |
xml_attr | Retrieve an attribute. |
xml_children | Navigate around the family tree. |
xml_find_all | Find nodes that match an xpath expression. |
xml_name | The (tag) name of an xml element. |
xml_name<- | Modify the (tag) name of an element |
xml_new_document | Create a new document |
xml_ns | XML namespaces. |
xml_ns_strip | Strip the default namespaces from a document |
xml_path | Retrieve the xpath to a node |
xml_replace | Modify a tree by inserting, replacing or removing nodes |
xml_set_namespace | Set the node's namespace |
xml_structure | Show the structure of an html/xml document. |
xml_text | Extract or modify the text |
xml_type | Determine the type of a node. |
xml_url | The URL of an XML document |
as_list
xmlノードのlist型への強制変換
> as_list(read_xml("<foo> a <b /><c><![CDATA[<d></d>]]></c></foo>"))
[[1]]
[1] " a "
$b
list()
$c
$c[[1]]
[1] "<d></d>"
> as_list(read_xml("<foo> <bar><baz /></bar> </foo>"))
$bar
$bar$baz
list()
> as_list(read_xml("<foo id = 'a'></foo>"))
list()
attr(,"id")
[1] "a"
> as_list(read_xml("<foo><bar id='a'/><bar id='b'/></foo>"))
$bar
list()
attr(,"id")
[1] "a"
$bar
list()
attr(,"id")
[1] "b"
read_xml
HTMLやXMLを読み込む
Arguments
- x
- encoding
- ...
- as_html
- base_url
- n
- verbose
> read_xml("http://www.xmlfiles.com/examples/cd_catalog.xml")
{xml_document}
<CATALOG>
[1] <CD>\n <TITLE>Empire Burlesque</TITLE>\n <ARTIST>Bob Dylan</ARTIS ...
[2] <CD>\n <TITLE>Hide your heart</TITLE>\n <ARTIST>Bonnie Tylor</ART ...
[3] <CD>\n <TITLE>Greatest Hits</TITLE>\n <ARTIST>Dolly Parton</ARTIS ...
[4] <CD>\n <TITLE>Still got the blues</TITLE>\n <ARTIST>Gary More</AR ...
[5] <CD>\n <TITLE>Eros</TITLE>\n <ARTIST>Eros Ramazzotti</ARTIST>\n ...
[6] <CD>\n <TITLE>One night only</TITLE>\n <ARTIST>Bee Gees</ARTIST>\ ...
[7] <CD>\n <TITLE>Sylvias Mother</TITLE>\n <ARTIST>Dr.Hook</ARTIST>\n ...
[8] <CD>\n <TITLE>Maggie May</TITLE>\n <ARTIST>Rod Stewart</ARTIST>\n ...
[9] <CD>\n <TITLE>Romanza</TITLE>\n <ARTIST>Andrea Bocelli</ARTIST>\n ...
[10] <CD>\n <TITLE>When a man loves a woman</TITLE>\n <ARTIST>Percy Sl ...
[11] <CD>\n <TITLE>Black angel</TITLE>\n <ARTIST>Savage Rose</ARTIST>\ ...
[12] <CD>\n <TITLE>1999 Grammy Nominees</TITLE>\n <ARTIST>Many</ARTIST ...
[13] <CD>\n <TITLE>For the good times</TITLE>\n <ARTIST>Kenny Rogers</ ...
[14] <CD>\n <TITLE>Big Willie style</TITLE>\n <ARTIST>Will Smith</ARTI ...
[15] <CD>\n <TITLE>Tupelo Honey</TITLE>\n <ARTIST>Van Morrison</ARTIST ...
[16] <CD>\n <TITLE>Soulsville</TITLE>\n <ARTIST>Jorn Hoel</ARTIST>\n ...
[17] <CD>\n <TITLE>The very best of</TITLE>\n <ARTIST>Cat Stevens</ART ...
[18] <CD>\n <TITLE>Stop</TITLE>\n <ARTIST>Sam Brown</ARTIST>\n <COUNT ...
[19] <CD>\n <TITLE>Bridge of Spies</TITLE>\n <ARTIST>T`Pau</ARTIST>\n ...
[20] <CD>\n <TITLE>Private Dancer</TITLE>\n <ARTIST>Tina Turner</ARTIS ...
...
> read_html("<html><title>Hi")
{xml_document}
<html>
[1] <head>\n <title>Hi</title>\n</head>
> read_html(system.file("extdata", "r-project.html", package = "xml2"))
{xml_document}
<html lang="en">
[1] <head>\n <meta charset="utf-8"/>\n <meta http-equiv="X-UA-Compatib ...
[2] <body>\n <div class="container page">\n <div class="row">\n ...
> read_html(x = "http://had.co.nz")
{xml_document}
<html lang="en">
[1] <head>\n <meta charset="utf-8"/>\n <meta http-equiv="X-UA-Compatib ...
[2] <body id="page-top" class="index">\n\n<!-- Navigation -->\n<nav clas ...
url_escape / url_unescape
URLのエスケープ・アンエスケープ
> url_escape(x = "にほんご")
[1] "%E3%81%AB%E3%81%BB%E3%82%93%E3%81%94"
> url_escape(x = "にほんご") %>% url_unescape()
[1] "にほんご"
url_parse
URLを分解する
> url_parse("http://had.co.nz/")
scheme server port user path query fragment
1 http had.co.nz NA /
> url_parse("http://had.co.nz:1234/")
scheme server port user path query fragment
1 http had.co.nz 1234 /
> url_parse("http://had.co.nz:1234/?a=1&b=2")
scheme server port user path query fragment
1 http had.co.nz 1234 / a=1&b=2
> url_parse("http://had.co.nz:1234/?a=1&b=2#def")
scheme server port user path query fragment
1 http had.co.nz 1234 / a=1&b=2 def
write_xml
XML文書を保存する
> h <- read_html("<p>Hi!</p>")
> tmp <- tempfile(fileext = ".xml")
> write_xml(h, tmp)
> read_xml(tmp)
{xml_document}
<html>
[1] <body>\n <p>Hi!</p>\n</body>
xml_find_all / xml_find_first / xml_find_num / xml_find_chr / xml_find_lgl
> x <- read_xml("<foo><bar><baz/></bar><baz/></foo>")
> xml_find_all(x, ".//baz")
{xml_nodeset (2)}
[1] <baz/>
[2] <baz/>
> xml_find_all(x, ".//baz") %>% xml_path()
[1] "/foo/bar/baz" "/foo/baz"
> xml_find_first(x, "//bar")
{xml_node}
<bar>
[1] <baz/>
xml_name
タグ名を取得する
> read_xml("<bar>123</bar>") %>% xml_name()
[1] "bar"
> (y <- read_xml("<bar><baz>1</baz>abc<foo /></bar>"))
{xml_document}
<bar>
[1] <baz>1</baz>
[2] <foo/>
> xml_children(y)
{xml_nodeset (2)}
[1] <baz>1</baz>
[2] <foo/>
> xml_children(y) %>% xml_name()
[1] "baz" "foo"
xml_ns / xml_ns_rename
XMLの名前空間
> xml_ns
function (x)
{
UseMethod("xml_ns")
}
<environment: namespace:xml2>
xml_path
ノードのxpathを取得する
> read_xml("<foo><bar><baz /></bar><baz /></foo>") %>% xml_find_all(., ".//baz") %>% xml_path()
[1] "/foo/bar/baz" "/foo/baz"
xml_structure / html_structure
xml / html文書の構造を示す
> xml_structure(read_xml("<a><b><c/><c/></b><d/></a>"))
<a>
<b>
<c>
<c>
<d>
> read_html(system.file("extdata","r-project.html", package = "xml2")) %>% {
+ xml_structure(.)
+ html_structure(.)
+ }
<html [lang]>
<head>
<meta [charset]>
<meta [http-equiv, content]>
<meta [name, content]>
<title>
{text}
<link [rel, type, href, sizes]>
<link [rel, type, href, sizes]>
{comment}
<link [href, rel]>
<link [href, rel]>
{comment}
{comment}
{comment}
<body>
{text}
<div [class]>
{text}
<div [class]>
{text}
<div [class, role]>
{text}
<div [class]>
{text}
<div [class]>
{text}
<p>
<a [href]>
<img [src, alt]>
{text}
<p>
<small>
<a [href]>
{text}
{text}
<h2>
{text}
{text}
<p>
<a [href]>
{text}
{text}
<h2>
{text}
{text}
<ul>
<li>
<a [href]>
{text}
{text}
<li>
<a [href]>
{text}
{text}
<li>
<a [href]>
{text}
{text}
<li>
<a [href]>
{text}
{text}
<li>
<a [href]>
{text}
{text}
<li>
<a [href]>
{text}
{text}
<li>
<a [href]>
{text}
{text}
{text}
<div [class]>
{text}
<h2>
{text}
{text}
<ul>
<li>
<a [href]>
{text}
{text}
<li>
<a [href]>
{text}
{text}
<li>
<a [href]>
{text}
{text}
<li>
<a [href]>
{text}
{text}
<li>
<a [href]>
{text}
{text}
{text}
<div [class]>
{text}
<h2>
{text}
{text}
<ul>
<li>
<a [href]>
{text}
{text}
<li>
<a [href]>
{text}
{text}
<li>
<a [href]>
{text}
{text}
<li>
<a [href]>
{text}
{text}
<li>
<a [href]>
{text}
{text}
<li>
<a [href]>
{text}
{text}
{text}
<div [class]>
{text}
<h2>
{text}
{text}
<ul>
<li>
<a [href]>
{text}
{text}
<li>
<a [href]>
{text}
{text}
{text}
{text}
{text}
<div [class]>
{text}
<h1>
{text}
{text}
<h2 [id]>
{text}
{text}
<p>
{text}
<strong>
<a [href]>
{text}
{text}
<a [href]>
{text}
{text}
{text}
<p>
{text}
<a [href]>
{text}
{text}
{text}
<h2 [id]>
{text}
{text}
<ul>
<li>
<p>
<a [href]>
<strong>
{text}
{text}
{text}
<li>
<p>
<strong>
{text}
{text}
{text}
<li>
<p>
<a [href]>
<strong>
{text}
{text}
{text}
<li>
<p>
<strong>
{text}
{text}
{text}
<li>
<p>
<strong>
<a [href]>
{text}
{text}
{text}
<li>
<p>
<strong>
<a [href]>
{text}
{text}
{text}
{comment}
{text}
{text}
<div [class]>
{text}
{text}
{text}
{comment}
{text}
<script [src]>
{comment}
<script [src]>
<html [lang]>
<head>
<meta [charset]>
<meta [http-equiv, content]>
<meta [name, content]>
<title>
{text}
<link [rel, type, href, sizes]>
<link [rel, type, href, sizes]>
{comment}
<link [href, rel]>
<link [href, rel]>
{comment}
{comment}
{comment}
<body>
{text}
<div.container.page>
{text}
<div.row>
{text}
<div.col-xs-12.col-sm-offset-1.col-sm-2.sidebar [role]>
{text}
<div.row>
{text}
<div.col-xs-4.col-sm-12>
{text}
<p>
<a [href]>
<img [src, alt]>
{text}
<p>
<small>
<a [href]>
{text}
{text}
<h2>
{text}
{text}
<p>
<a [href]>
{text}
{text}
<h2>
{text}
{text}
<ul>
<li>
<a [href]>
{text}
{text}
<li>
<a [href]>
{text}
{text}
<li>
<a [href]>
{text}
{text}
<li>
<a [href]>
{text}
{text}
<li>
<a [href]>
{text}
{text}
<li>
<a [href]>
{text}
{text}
<li>
<a [href]>
{text}
{text}
{text}
<div.col-xs-4.col-sm-12>
{text}
<h2>
{text}
{text}
<ul>
<li>
<a [href]>
{text}
{text}
<li>
<a [href]>
{text}
{text}
<li>
<a [href]>
{text}
{text}
<li>
<a [href]>
{text}
{text}
<li>
<a [href]>
{text}
{text}
{text}
<div.col-xs-4.col-sm-12>
{text}
<h2>
{text}
{text}
<ul>
<li>
<a [href]>
{text}
{text}
<li>
<a [href]>
{text}
{text}
<li>
<a [href]>
{text}
{text}
<li>
<a [href]>
{text}
{text}
<li>
<a [href]>
{text}
{text}
<li>
<a [href]>
{text}
{text}
{text}
<div.col-xs-4.col-sm-12>
{text}
<h2>
{text}
{text}
<ul>
<li>
<a [href]>
{text}
{text}
<li>
<a [href]>
{text}
{text}
{text}
{text}
{text}
<div.col-xs-12.col-sm-7>
{text}
<h1>
{text}
{text}
<h2#getting-started>
{text}
{text}
<p>
{text}
<strong>
<a [href]>
{text}
{text}
<a [href]>
{text}
{text}
{text}
<p>
{text}
<a [href]>
{text}
{text}
{text}
<h2#news>
{text}
{text}
<ul>
<li>
<p>
<a [href]>
<strong>
{text}
{text}
{text}
<li>
<p>
<strong>
{text}
{text}
{text}
<li>
<p>
<a [href]>
<strong>
{text}
{text}
{text}
<li>
<p>
<strong>
{text}
{text}
{text}
<li>
<p>
<strong>
<a [href]>
{text}
{text}
{text}
<li>
<p>
<strong>
<a [href]>
{text}
{text}
{text}
{comment}
{text}
{text}
<div.raw.footer>
{text}
{text}
{text}
{comment}
{text}
<script [src]>
{comment}
<script [src]>
xml_text
xmlのテキストを抽出する
> read_xml("<p>This is some text. This is <b>bold!</b></p>") %>% xml_text()
[1] "This is some text. This is bold!"
> read_xml("<x>This is some text. <x>This is some nested text.</x></x>") %>%
+ xml_find_all(., "//x") %>%
+ xml_text()
[1] "This is some text. This is some nested text."
[2] "This is some nested text."
> read_xml("<p> Some text </p>") %>% xml_text(trim = TRUE)
[1] "Some text"
xml_type
> read_xml("<foo> a <b /> <![CDATA[ blah]]></foo>") %>% xml_type()
[1] "element"
xml_url
> read_xml("http://www.xmlfiles.com/examples/cd_catalog.xml") %>% xml_url()
[1] "http://www.xmlfiles.com/examples/cd_catalog.xml"