tabulizer: Bindings for Tabula PDF Table Extractor Library
- CRAN: http://cran.r-project.org/web/packages/tabulizer/index.html
- GitHub: https://github.com/ropenscilabs/tabulizer
- Vignettes:
> library(tabulizer)
> f <- system.file("examples", "data.pdf", package = "tabulizer")
バージョン: 0.1.22
関数名 | 概略 |
---|---|
extract_metadata | extract_metadata |
extract_tables | extract_tables |
extract_text | extract_text |
get_page_dims | Page length and dimensions |
locate_areas | extract_areas |
make_thumbnails | make_thumbnails |
split_pdf | Split and merge PDFs |
stop_logging | rJava logging |
tabulizer-package | tabulizer |
extract_metadata
メタ情報の抽出
> extract_metadata(f)
$pages
[1] 3
$title
NULL
$author
NULL
$subject
NULL
$keywords
NULL
$creator
[1] "TeX"
$producer
[1] "MiKTeX pdfTeX-1.40.16"
$created
[1] "Sat Apr 30 00:40:40 JST 2016"
$modified
[1] "Sat Apr 30 00:40:40 JST 2016"
$trapped
[1] "False"
extract_tables
ファイルからの表データ抽出
Arguments
- file
- pages
- area
- columns
- guess
- spreadsheet
- method
- password
- encoding
- ...
> extract_tables(f) # すべてのページの表を行列のリストとして取得
[[1]]
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,] "mpg" "cyl" "disp" "hp" "drat" "wt" "qsec" "vs" "am" "gear"
[2,] "21.0" "6" "160.0" "110" "3.90" "2.620" "16.46" "0" "1" "4"
[3,] "21.0" "6" "160.0" "110" "3.90" "2.875" "17.02" "0" "1" "4"
[4,] "22.8" "4" "108.0" "93" "3.85" "2.320" "18.61" "1" "1" "4"
[5,] "21.4" "6" "258.0" "110" "3.08" "3.215" "19.44" "1" "0" "3"
[6,] "18.7" "8" "360.0" "175" "3.15" "3.440" "17.02" "0" "0" "3"
[7,] "18.1" "6" "225.0" "105" "2.76" "3.460" "20.22" "1" "0" "3"
[8,] "14.3" "8" "360.0" "245" "3.21" "3.570" "15.84" "0" "0" "3"
[9,] "24.4" "4" "146.7" "62" "3.69" "3.190" "20.00" "1" "0" "4"
[10,] "22.8" "4" "140.8" "95" "3.92" "3.150" "22.90" "1" "0" "4"
[11,] "19.2" "6" "167.6" "123" "3.92" "3.440" "18.30" "1" "0" "4"
[12,] "17.8" "6" "167.6" "123" "3.92" "3.440" "18.90" "1" "0" "4"
[13,] "16.4" "8" "275.8" "180" "3.07" "4.070" "17.40" "0" "0" "3"
[14,] "17.3" "8" "275.8" "180" "3.07" "3.730" "17.60" "0" "0" "3"
[15,] "15.2" "8" "275.8" "180" "3.07" "3.780" "18.00" "0" "0" "3"
[16,] "10.4" "8" "472.0" "205" "2.93" "5.250" "17.98" "0" "0" "3"
[17,] "10.4" "8" "460.0" "215" "3.00" "5.424" "17.82" "0" "0" "3"
[18,] "14.7" "8" "440.0" "230" "3.23" "5.345" "17.42" "0" "0" "3"
[19,] "32.4" "4" "78.7" "66" "4.08" "2.200" "19.47" "1" "1" "4"
[20,] "30.4" "4" "75.7" "52" "4.93" "1.615" "18.52" "1" "1" "4"
[21,] "33.9" "4" "71.1" "65" "4.22" "1.835" "19.90" "1" "1" "4"
[22,] "21.5" "4" "120.1" "97" "3.70" "2.465" "20.01" "1" "0" "3"
[23,] "15.5" "8" "318.0" "150" "2.76" "3.520" "16.87" "0" "0" "3"
[24,] "15.2" "8" "304.0" "150" "3.15" "3.435" "17.30" "0" "0" "3"
[25,] "13.3" "8" "350.0" "245" "3.73" "3.840" "15.41" "0" "0" "3"
[26,] "19.2" "8" "400.0" "175" "3.08" "3.845" "17.05" "0" "0" "3"
[27,] "27.3" "4" "79.0" "66" "4.08" "1.935" "18.90" "1" "1" "4"
[28,] "26.0" "4" "120.3" "91" "4.43" "2.140" "16.70" "0" "1" "5"
[29,] "30.4" "4" "95.1" "113" "3.77" "1.513" "16.90" "1" "1" "5"
[30,] "15.8" "8" "351.0" "264" "4.22" "3.170" "14.50" "0" "1" "5"
[31,] "19.7" "6" "145.0" "175" "3.62" "2.770" "15.50" "0" "1" "5"
[32,] "15.0" "8" "301.0" "335" "3.54" "3.570" "14.60" "0" "1" "5"
[[2]]
[,1] [,2] [,3]
[1,] "Sepal.Width" "Petal.Length" "Petal.Width"
[2,] "3.5" "1.4" "0.2"
[3,] "3.0" "1.4" "0.2"
[4,] "3.2" "1.3" "0.2"
[5,] "3.1" "1.5" "0.2"
[6,] "3.6" "1.4" "0.2"
[7,] "3.9" "1.7" "0.4"
[[3]]
[,1]
[1,] "supp"
[2,] "VC"
[3,] "VC"
[4,] "VC"
[5,] "VC"
[6,] "VC"
[7,] "VC"
[8,] "VC"
[9,] "VC"
[10,] "VC"
[11,] "VC"
[12,] "VC"
[13,] "VC"
[14,] "VC"
[15,] "VC"
> extract_tables(f, pages = 2, method = "data.frame")
[[1]]
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
3 4.7 3.2 1.3 0.2 setosa
4 4.6 3.1 1.5 0.2 setosa
5 5.0 3.6 1.4 0.2 setosa
6 5.4 3.9 1.7 0.4 setosa
[[2]]
X Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 145 6.7 3.3 5.7 2.5 virginica
2 146 6.7 3.0 5.2 2.3 virginica
3 147 6.3 2.5 5.0 1.9 virginica
4 148 6.5 3.0 5.2 2.0 virginica
5 149 6.2 3.4 5.4 2.3 virginica
6 150 5.9 3.0 5.1 1.8 virginica
> # 各ページの表データをcsvとして保存する
> # extract_tables(f, method = "csv")
> # json, tsvも選択可
extract_text
get_page_dims
> get_page_dims(f)
[[1]]
[1] 612 792
[[2]]
[1] 612 792
[[3]]
[1] 612 792
locate_areas / extract_areas
インタラクティブに選択範囲を操作
> locate_areas(f)
> extract_areas(f) # Shiny Widgets
make_thumbnails
PDFの各ページのサムネイル画像を生成する(PDFのあるディレクトリにpngが作成される)
split_pdf / merge_pdfs
PDFの分割と結合
> sf <- split_pdf(f)
> merge_pdfs(sf, "merged.pdf")
stop_logging
> stop_logging()