xml2: Parse XML

R上でXMLやHTMLをより簡易に解析する

> library(xml2)
> library(magrittr) # 以降の例で %>% を使用するため

バージョン: 1.0.0


関数名 概略
as_list Coerce xml nodes to a list.
read_xml Read HTML or XML.
url_absolute Convert between relative and absolute urls.
url_escape Escape and unescape urls.
url_parse Parse a url into its component pieces.
write_xml Write XML to disk.
xml_attr Retrieve an attribute.
xml_children Navigate around the family tree.
xml_find_all Find nodes that match an xpath expression.
xml_name The (tag) name of an xml element.
xml_name<- Modify the (tag) name of an element
xml_new_document Create a new document
xml_ns XML namespaces.
xml_ns_strip Strip the default namespaces from a document
xml_path Retrieve the xpath to a node
xml_replace Modify a tree by inserting, replacing or removing nodes
xml_set_namespace Set the node's namespace
xml_structure Show the structure of an html/xml document.
xml_text Extract or modify the text
xml_type Determine the type of a node.
xml_url The URL of an XML document

as_list

xmlノードのlist型への強制変換

> as_list(read_xml("<foo> a <b /><c><![CDATA[<d></d>]]></c></foo>"))
[[1]]
[1] " a "

$b
list()

$c
$c[[1]]
[1] "<d></d>"
> as_list(read_xml("<foo> <bar><baz /></bar> </foo>"))
$bar
$bar$baz
list()
> as_list(read_xml("<foo id = 'a'></foo>"))
list()
attr(,"id")
[1] "a"
> as_list(read_xml("<foo><bar id='a'/><bar id='b'/></foo>"))
$bar
list()
attr(,"id")
[1] "a"

$bar
list()
attr(,"id")
[1] "b"

read_xml / read_html

HTMLやXMLを読み込む

Arguments

  • x
  • encoding
  • ...
  • as_html
  • base_url
  • n
  • verbose
> read_xml("http://www.xmlfiles.com/examples/cd_catalog.xml")
{xml_document}
<CATALOG>
 [1] <CD>\n  <TITLE>Empire Burlesque</TITLE>\n  <ARTIST>Bob Dylan</ARTIS ...
 [2] <CD>\n  <TITLE>Hide your heart</TITLE>\n  <ARTIST>Bonnie Tylor</ART ...
 [3] <CD>\n  <TITLE>Greatest Hits</TITLE>\n  <ARTIST>Dolly Parton</ARTIS ...
 [4] <CD>\n  <TITLE>Still got the blues</TITLE>\n  <ARTIST>Gary More</AR ...
 [5] <CD>\n  <TITLE>Eros</TITLE>\n  <ARTIST>Eros Ramazzotti</ARTIST>\n   ...
 [6] <CD>\n  <TITLE>One night only</TITLE>\n  <ARTIST>Bee Gees</ARTIST>\ ...
 [7] <CD>\n  <TITLE>Sylvias Mother</TITLE>\n  <ARTIST>Dr.Hook</ARTIST>\n ...
 [8] <CD>\n  <TITLE>Maggie May</TITLE>\n  <ARTIST>Rod Stewart</ARTIST>\n ...
 [9] <CD>\n  <TITLE>Romanza</TITLE>\n  <ARTIST>Andrea Bocelli</ARTIST>\n ...
[10] <CD>\n  <TITLE>When a man loves a woman</TITLE>\n  <ARTIST>Percy Sl ...
[11] <CD>\n  <TITLE>Black angel</TITLE>\n  <ARTIST>Savage Rose</ARTIST>\ ...
[12] <CD>\n  <TITLE>1999 Grammy Nominees</TITLE>\n  <ARTIST>Many</ARTIST ...
[13] <CD>\n  <TITLE>For the good times</TITLE>\n  <ARTIST>Kenny Rogers</ ...
[14] <CD>\n  <TITLE>Big Willie style</TITLE>\n  <ARTIST>Will Smith</ARTI ...
[15] <CD>\n  <TITLE>Tupelo Honey</TITLE>\n  <ARTIST>Van Morrison</ARTIST ...
[16] <CD>\n  <TITLE>Soulsville</TITLE>\n  <ARTIST>Jorn Hoel</ARTIST>\n   ...
[17] <CD>\n  <TITLE>The very best of</TITLE>\n  <ARTIST>Cat Stevens</ART ...
[18] <CD>\n  <TITLE>Stop</TITLE>\n  <ARTIST>Sam Brown</ARTIST>\n  <COUNT ...
[19] <CD>\n  <TITLE>Bridge of Spies</TITLE>\n  <ARTIST>T`Pau</ARTIST>\n  ...
[20] <CD>\n  <TITLE>Private Dancer</TITLE>\n  <ARTIST>Tina Turner</ARTIS ...
...
> read_html("<html><title>Hi")
{xml_document}
<html>
[1] <head>\n  <title>Hi</title>\n</head>
> read_html(system.file("extdata", "r-project.html", package = "xml2"))
{xml_document}
<html lang="en">
[1] <head>\n  <meta charset="utf-8"/>\n  <meta http-equiv="X-UA-Compatib ...
[2] <body>\n    <div class="container page">\n      <div class="row">\n  ...
> read_html(x = "http://had.co.nz")
{xml_document}
<html lang="en">
[1] <head>\n  <meta charset="utf-8"/>\n  <meta http-equiv="X-UA-Compatib ...
[2] <body id="page-top" class="index">\n\n<!-- Navigation -->\n<nav clas ...

url_escape / url_unescape

URLのエスケープ・アンエスケープ

> url_escape(x = "にほんご")
[1] "%E3%81%AB%E3%81%BB%E3%82%93%E3%81%94"
> url_escape(x = "にほんご") %>% url_unescape()
[1] "にほんご"

url_parse

URLを分解する

> url_parse("http://had.co.nz/")
  scheme    server port user path query fragment
1   http had.co.nz   NA         /
> url_parse("http://had.co.nz:1234/")
  scheme    server port user path query fragment
1   http had.co.nz 1234         /
> url_parse("http://had.co.nz:1234/?a=1&b=2")
  scheme    server port user path   query fragment
1   http had.co.nz 1234         / a=1&b=2
> url_parse("http://had.co.nz:1234/?a=1&b=2#def")
  scheme    server port user path   query fragment
1   http had.co.nz 1234         / a=1&b=2      def

write_xml

XML文書を保存する

> h <- read_html("<p>Hi!</p>")
> tmp <- tempfile(fileext = ".xml")
> write_xml(h, tmp)
> read_xml(tmp)
{xml_document}
<html>
[1] <body>\n  <p>Hi!</p>\n</body>

xml_find_all / xml_find_first / xml_find_num / xml_find_chr / xml_find_lgl

> x <- read_xml("<foo><bar><baz/></bar><baz/></foo>")
> xml_find_all(x, ".//baz")
{xml_nodeset (2)}
[1] <baz/>
[2] <baz/>
> xml_find_all(x, ".//baz") %>% xml_path()
[1] "/foo/bar/baz" "/foo/baz"
> xml_find_first(x, "//bar")
{xml_node}
<bar>
[1] <baz/>

xml_name

タグ名を取得する

> read_xml("<bar>123</bar>") %>% xml_name()
[1] "bar"
> (y <- read_xml("<bar><baz>1</baz>abc<foo /></bar>"))
{xml_document}
<bar>
[1] <baz>1</baz>
[2] <foo/>
> xml_children(y)
{xml_nodeset (2)}
[1] <baz>1</baz>
[2] <foo/>
> xml_children(y) %>% xml_name()
[1] "baz" "foo"

xml_ns / xml_ns_rename

XMLの名前空間

> xml_ns
function (x) 
{
    UseMethod("xml_ns")
}
<environment: namespace:xml2>

xml_path

ノードのxpathを取得する

> read_xml("<foo><bar><baz /></bar><baz /></foo>") %>% xml_find_all(., ".//baz") %>% xml_path()
[1] "/foo/bar/baz" "/foo/baz"

xml_structure / html_structure

xml / html文書の構造を示す

> xml_structure(read_xml("<a><b><c/><c/></b><d/></a>"))
<a>
  <b>
    <c>
    <c>
  <d>
> read_html(system.file("extdata","r-project.html", package = "xml2")) %>% {
+   xml_structure(.)
+   html_structure(.)
+ }
<html [lang]>
  <head>
    <meta [charset]>
    <meta [http-equiv, content]>
    <meta [name, content]>
    <title>
      {text}
    <link [rel, type, href, sizes]>
    <link [rel, type, href, sizes]>
    {comment}
    <link [href, rel]>
    <link [href, rel]>
    {comment}
    {comment}
    {comment}
  <body>
    {text}
    <div [class]>
      {text}
      <div [class]>
        {text}
        <div [class, role]>
          {text}
          <div [class]>
            {text}
            <div [class]>
              {text}
              <p>
                <a [href]>
                  <img [src, alt]>
              {text}
              <p>
                <small>
                  <a [href]>
                    {text}
              {text}
              <h2>
                {text}
              {text}
              <p>
                <a [href]>
                  {text}
              {text}
              <h2>
                {text}
              {text}
              <ul>
                <li>
                  <a [href]>
                    {text}
                {text}
                <li>
                  <a [href]>
                    {text}
                {text}
                <li>
                  <a [href]>
                    {text}
                {text}
                <li>
                  <a [href]>
                    {text}
                {text}
                <li>
                  <a [href]>
                    {text}
                {text}
                <li>
                  <a [href]>
                    {text}
                {text}
                <li>
                  <a [href]>
                    {text}
                {text}
            {text}
            <div [class]>
              {text}
              <h2>
                {text}
              {text}
              <ul>
                <li>
                  <a [href]>
                    {text}
                {text}
                <li>
                  <a [href]>
                    {text}
                {text}
                <li>
                  <a [href]>
                    {text}
                {text}
                <li>
                  <a [href]>
                    {text}
                {text}
                <li>
                  <a [href]>
                    {text}
                {text}
            {text}
            <div [class]>
              {text}
              <h2>
                {text}
              {text}
              <ul>
                <li>
                  <a [href]>
                    {text}
                {text}
                <li>
                  <a [href]>
                    {text}
                {text}
                <li>
                  <a [href]>
                    {text}
                {text}
                <li>
                  <a [href]>
                    {text}
                {text}
                <li>
                  <a [href]>
                    {text}
                {text}
                <li>
                  <a [href]>
                    {text}
                {text}
            {text}
            <div [class]>
              {text}
              <h2>
                {text}
              {text}
              <ul>
                <li>
                  <a [href]>
                    {text}
                {text}
                <li>
                  <a [href]>
                    {text}
                {text}
            {text}
          {text}
        {text}
        <div [class]>
          {text}
          <h1>
            {text}
          {text}
          <h2 [id]>
            {text}
          {text}
          <p>
            {text}
            <strong>
              <a [href]>
                {text}
            {text}
            <a [href]>
              {text}
            {text}
          {text}
          <p>
            {text}
            <a [href]>
              {text}
            {text}
          {text}
          <h2 [id]>
            {text}
          {text}
          <ul>
            <li>
              <p>
                <a [href]>
                  <strong>
                    {text}
                {text}
            {text}
            <li>
              <p>
                <strong>
                  {text}
                {text}
            {text}
            <li>
              <p>
                <a [href]>
                  <strong>
                    {text}
                {text}
            {text}
            <li>
              <p>
                <strong>
                  {text}
                {text}
            {text}
            <li>
              <p>
                <strong>
                  <a [href]>
                    {text}
                {text}
            {text}
            <li>
              <p>
                <strong>
                  <a [href]>
                    {text}
                {text}
            {text}
          {comment}
        {text}
      {text}
      <div [class]>
        {text}
      {text}
    {text}
    {comment}
    {text}
    <script [src]>
    {comment}
    <script [src]>
<html [lang]>
  <head>
    <meta [charset]>
    <meta [http-equiv, content]>
    <meta [name, content]>
    <title>
      {text}
    <link [rel, type, href, sizes]>
    <link [rel, type, href, sizes]>
    {comment}
    <link [href, rel]>
    <link [href, rel]>
    {comment}
    {comment}
    {comment}
  <body>
    {text}
    <div.container.page>
      {text}
      <div.row>
        {text}
        <div.col-xs-12.col-sm-offset-1.col-sm-2.sidebar [role]>
          {text}
          <div.row>
            {text}
            <div.col-xs-4.col-sm-12>
              {text}
              <p>
                <a [href]>
                  <img [src, alt]>
              {text}
              <p>
                <small>
                  <a [href]>
                    {text}
              {text}
              <h2>
                {text}
              {text}
              <p>
                <a [href]>
                  {text}
              {text}
              <h2>
                {text}
              {text}
              <ul>
                <li>
                  <a [href]>
                    {text}
                {text}
                <li>
                  <a [href]>
                    {text}
                {text}
                <li>
                  <a [href]>
                    {text}
                {text}
                <li>
                  <a [href]>
                    {text}
                {text}
                <li>
                  <a [href]>
                    {text}
                {text}
                <li>
                  <a [href]>
                    {text}
                {text}
                <li>
                  <a [href]>
                    {text}
                {text}
            {text}
            <div.col-xs-4.col-sm-12>
              {text}
              <h2>
                {text}
              {text}
              <ul>
                <li>
                  <a [href]>
                    {text}
                {text}
                <li>
                  <a [href]>
                    {text}
                {text}
                <li>
                  <a [href]>
                    {text}
                {text}
                <li>
                  <a [href]>
                    {text}
                {text}
                <li>
                  <a [href]>
                    {text}
                {text}
            {text}
            <div.col-xs-4.col-sm-12>
              {text}
              <h2>
                {text}
              {text}
              <ul>
                <li>
                  <a [href]>
                    {text}
                {text}
                <li>
                  <a [href]>
                    {text}
                {text}
                <li>
                  <a [href]>
                    {text}
                {text}
                <li>
                  <a [href]>
                    {text}
                {text}
                <li>
                  <a [href]>
                    {text}
                {text}
                <li>
                  <a [href]>
                    {text}
                {text}
            {text}
            <div.col-xs-4.col-sm-12>
              {text}
              <h2>
                {text}
              {text}
              <ul>
                <li>
                  <a [href]>
                    {text}
                {text}
                <li>
                  <a [href]>
                    {text}
                {text}
            {text}
          {text}
        {text}
        <div.col-xs-12.col-sm-7>
          {text}
          <h1>
            {text}
          {text}
          <h2#getting-started>
            {text}
          {text}
          <p>
            {text}
            <strong>
              <a [href]>
                {text}
            {text}
            <a [href]>
              {text}
            {text}
          {text}
          <p>
            {text}
            <a [href]>
              {text}
            {text}
          {text}
          <h2#news>
            {text}
          {text}
          <ul>
            <li>
              <p>
                <a [href]>
                  <strong>
                    {text}
                {text}
            {text}
            <li>
              <p>
                <strong>
                  {text}
                {text}
            {text}
            <li>
              <p>
                <a [href]>
                  <strong>
                    {text}
                {text}
            {text}
            <li>
              <p>
                <strong>
                  {text}
                {text}
            {text}
            <li>
              <p>
                <strong>
                  <a [href]>
                    {text}
                {text}
            {text}
            <li>
              <p>
                <strong>
                  <a [href]>
                    {text}
                {text}
            {text}
          {comment}
        {text}
      {text}
      <div.raw.footer>
        {text}
      {text}
    {text}
    {comment}
    {text}
    <script [src]>
    {comment}
    <script [src]>

xml_text

xmlのテキストを抽出する

> read_xml("<p>This is some text. This is <b>bold!</b></p>") %>% xml_text()
[1] "This is some text. This is bold!"
> read_xml("<x>This is some text. <x>This is some nested text.</x></x>") %>% 
+   xml_find_all(., "//x") %>% 
+   xml_text()
[1] "This is some text. This is some nested text."
[2] "This is some nested text."
> read_xml("<p>   Some text    </p>") %>% xml_text(trim = TRUE)
[1] "Some text"

xml_type

> read_xml("<foo> a <b /> <![CDATA[ blah]]></foo>") %>% xml_type()
[1] "element"

xml_url

> read_xml("http://www.xmlfiles.com/examples/cd_catalog.xml") %>% xml_url()
[1] "http://www.xmlfiles.com/examples/cd_catalog.xml"