tokenizers: Tokenize Text
テキストのトークナイズ(字句解析)
- CRAN: https://cran.r-project.org/package=tokenizers
- GitHub: https://github.com/lmullen/tokenizers
> library(tokenizers)
Attaching package: 'tokenizers'
The following object is masked from 'package:tm':
stopwords
バージョン: 0.1.3
関数名 | 概略 |
---|---|
basic-tokenizers | Basic tokenizers |
ngram-tokenizers | N-gram tokenizers |
stopwords | Stopword lists |
tokenize_word_stems | Word stem tokenizers |
tokenizers | tokenizers |
basic-tokenizers
テキスト区切り
Arguments
- x
- lowercase
- strip_non_alphanum
- simplify
- stopwords
- strip_punctuation
- paragraph_break
- pattern
> (text <- library(help = "tokenizers")$info[[1]][6])
[1] "License: MIT + file LICENSE"
> # 文字区切り
> tokenize_characters(text) %>%
+ head(20)
[[1]]
[1] "l" "i" "c" "e" "n" "s" "e" "m" "i" "t" "+" "f" "i" "l" "e" "l" "i"
[18] "c" "e" "n" "s" "e"
> # 単語区切り
> tokenize_words(text)
[[1]]
[1] "license" "mit" "file" "license"
> # 文章区切り
> tokenize_sentences(text)
[[1]]
[1] "License: MIT + file LICENSE"
> # 段落区切り
> tokenize_paragraphs(text)
[[1]]
[1] "License: MIT + file LICENSE"
> # 行区切り
> tokenize_lines(text)
[[1]]
[1] "License: MIT + file LICENSE"
> tokenize_regex("A,B,C,D,E", pattern = ",")
[[1]]
[1] "A" "B" "C" "D" "E"
> text <- "メロスは激怒した。必ず、かのじゃちぼうぎゃく邪智暴虐の王を除かなければならぬと決意した。"
> tokenize_words(text)
[[1]]
[1] "メロス" "は" "激怒" "した" "必ず" "か" "の"
[8] "じゃ" "ち" "ぼう" "ぎゃく" "邪智" "暴虐" "の"
[15] "王" "を" "除" "かな" "け" "れ" "ば"
[22] "なら" "ぬ" "と" "決意" "した"
> tokenize_sentences(text)
[[1]]
[1] "メロスは激怒した。"
[2] "必ず、かのじゃちぼうぎゃく邪智暴虐の王を除かなければならぬと決意した。"
ngram-tokenizers
Arguments
- x
- lowercase
- n
- n_min
- stopwords
- ngram_delim
- simplify
- k
> song <- paste0("How many roads must a man walk down\n",
+ "Before you call him a man?\n",
+ "How many seas must a white dove sail\n",
+ "Before she sleeps in the sand?\n",
+ "\n",
+ "How many times must the cannonballs fly\n",
+ "Before they're forever banned?\n",
+ "The answer, my friend, is blowin' in the wind.\n",
+ "The answer is blowin' in the wind.\n")
>
> tokenize_ngrams(song, n = 4)
[[1]]
[1] "how many roads must" "many roads must a"
[3] "roads must a man" "must a man walk"
[5] "a man walk down" "man walk down before"
[7] "walk down before you" "down before you call"
[9] "before you call him" "you call him a"
[11] "call him a man" "him a man how"
[13] "a man how many" "man how many seas"
[15] "how many seas must" "many seas must a"
[17] "seas must a white" "must a white dove"
[19] "a white dove sail" "white dove sail before"
[21] "dove sail before she" "sail before she sleeps"
[23] "before she sleeps in" "she sleeps in the"
[25] "sleeps in the sand" "in the sand how"
[27] "the sand how many" "sand how many times"
[29] "how many times must" "many times must the"
[31] "times must the cannonballs" "must the cannonballs fly"
[33] "the cannonballs fly before" "cannonballs fly before they're"
[35] "fly before they're forever" "before they're forever banned"
[37] "they're forever banned the" "forever banned the answer"
[39] "banned the answer my" "the answer my friend"
[41] "answer my friend is" "my friend is blowin"
[43] "friend is blowin in" "is blowin in the"
[45] "blowin in the wind" "in the wind the"
[47] "the wind the answer" "wind the answer is"
[49] "the answer is blowin" "answer is blowin in"
[51] "is blowin in the" "blowin in the wind"
> tokenize_ngrams(song, n = 4, n_min = 1)
[[1]]
[1] "how" "how many"
[3] "how many roads" "how many roads must"
[5] "many" "many roads"
[7] "many roads must" "many roads must a"
[9] "roads" "roads must"
[11] "roads must a" "roads must a man"
[13] "must" "must a"
[15] "must a man" "must a man walk"
[17] "a" "a man"
[19] "a man walk" "a man walk down"
[21] "man" "man walk"
[23] "man walk down" "man walk down before"
[25] "walk" "walk down"
[27] "walk down before" "walk down before you"
[29] "down" "down before"
[31] "down before you" "down before you call"
[33] "before" "before you"
[35] "before you call" "before you call him"
[37] "you" "you call"
[39] "you call him" "you call him a"
[41] "call" "call him"
[43] "call him a" "call him a man"
[45] "him" "him a"
[47] "him a man" "him a man how"
[49] "a" "a man"
[51] "a man how" "a man how many"
[53] "man" "man how"
[55] "man how many" "man how many seas"
[57] "how" "how many"
[59] "how many seas" "how many seas must"
[61] "many" "many seas"
[63] "many seas must" "many seas must a"
[65] "seas" "seas must"
[67] "seas must a" "seas must a white"
[69] "must" "must a"
[71] "must a white" "must a white dove"
[73] "a" "a white"
[75] "a white dove" "a white dove sail"
[77] "white" "white dove"
[79] "white dove sail" "white dove sail before"
[81] "dove" "dove sail"
[83] "dove sail before" "dove sail before she"
[85] "sail" "sail before"
[87] "sail before she" "sail before she sleeps"
[89] "before" "before she"
[91] "before she sleeps" "before she sleeps in"
[93] "she" "she sleeps"
[95] "she sleeps in" "she sleeps in the"
[97] "sleeps" "sleeps in"
[99] "sleeps in the" "sleeps in the sand"
[101] "in" "in the"
[103] "in the sand" "in the sand how"
[105] "the" "the sand"
[107] "the sand how" "the sand how many"
[109] "sand" "sand how"
[111] "sand how many" "sand how many times"
[113] "how" "how many"
[115] "how many times" "how many times must"
[117] "many" "many times"
[119] "many times must" "many times must the"
[121] "times" "times must"
[123] "times must the" "times must the cannonballs"
[125] "must" "must the"
[127] "must the cannonballs" "must the cannonballs fly"
[129] "the" "the cannonballs"
[131] "the cannonballs fly" "the cannonballs fly before"
[133] "cannonballs" "cannonballs fly"
[135] "cannonballs fly before" "cannonballs fly before they're"
[137] "fly" "fly before"
[139] "fly before they're" "fly before they're forever"
[141] "before" "before they're"
[143] "before they're forever" "before they're forever banned"
[145] "they're" "they're forever"
[147] "they're forever banned" "they're forever banned the"
[149] "forever" "forever banned"
[151] "forever banned the" "forever banned the answer"
[153] "banned" "banned the"
[155] "banned the answer" "banned the answer my"
[157] "the" "the answer"
[159] "the answer my" "the answer my friend"
[161] "answer" "answer my"
[163] "answer my friend" "answer my friend is"
[165] "my" "my friend"
[167] "my friend is" "my friend is blowin"
[169] "friend" "friend is"
[171] "friend is blowin" "friend is blowin in"
[173] "is" "is blowin"
[175] "is blowin in" "is blowin in the"
[177] "blowin" "blowin in"
[179] "blowin in the" "blowin in the wind"
[181] "in" "in the"
[183] "in the wind" "in the wind the"
[185] "the" "the wind"
[187] "the wind the" "the wind the answer"
[189] "wind" "wind the"
[191] "wind the answer" "wind the answer is"
[193] "the" "the answer"
[195] "the answer is" "the answer is blowin"
[197] "answer" "answer is"
[199] "answer is blowin" "answer is blowin in"
[201] "is" "is blowin"
[203] "is blowin in" "is blowin in the"
[205] "blowin" "blowin in"
[207] "blowin in the" "blowin in the wind"
[209] "in" "in the"
[211] "in the wind" "the"
[213] "the wind" "wind"
> tokenize_skip_ngrams(song, n = 4, k = 2)
[[1]]
[1] "how must walk you" "many a down call"
[3] "roads man before him" "must walk you a"
[5] "a down call man" "man before him how"
[7] "walk you a many" "down call man seas"
[9] "before him how must" "you a many a"
[11] "call man seas white" "him how must dove"
[13] "a many a sail" "man seas white before"
[15] "how must dove she" "many a sail sleeps"
[17] "seas white before in" "must dove she the"
[19] "a sail sleeps sand" "white before in how"
[21] "dove she the many" "sail sleeps sand times"
[23] "before in how must" "she the many the"
[25] "sleeps sand times cannonballs" "in how must fly"
[27] "the many the before" "sand times cannonballs they're"
[29] "how must fly forever" "many the before banned"
[31] "times cannonballs they're the" "must fly forever answer"
[33] "the before banned my" "cannonballs they're the friend"
[35] "fly forever answer is" "before banned my blowin"
[37] "they're the friend in" "forever answer is the"
[39] "banned my blowin wind" "the friend in the"
[41] "answer is the answer" "my blowin wind is"
[43] "friend in the blowin" "is the answer in"
[45] "blowin wind is the" "in the blowin wind"
[47] "how roads a walk" "many must man down"
[49] "roads a walk before" "must man down you"
[51] "a walk before call" "man down you him"
[53] "walk before call a" "down you him man"
[55] "before call a how" "you him man many"
[57] "call a how seas" "him man many must"
[59] "a how seas a" "man many must white"
[61] "how seas a dove" "many must white sail"
[63] "seas a dove before" "must white sail she"
[65] "a dove before sleeps" "white sail she in"
[67] "dove before sleeps the" "sail she in sand"
[69] "before sleeps the how" "she in sand many"
[71] "sleeps the how times" "in sand many must"
[73] "the how times the" "sand many must cannonballs"
[75] "how times the fly" "many must cannonballs before"
[77] "times the fly they're" "must cannonballs before forever"
[79] "the fly they're banned" "cannonballs before forever the"
[81] "fly they're banned answer" "before forever the my"
[83] "they're banned answer friend" "forever the my is"
[85] "banned answer friend blowin" "the my is in"
[87] "answer friend blowin the" "my is in wind"
[89] "friend blowin the the" "is in wind answer"
[91] "blowin the the is" "in wind answer blowin"
[93] "the the is in" "wind answer blowin the"
[95] "the is in wind" "how many roads must"
[97] "many roads must a" "roads must a man"
[99] "must a man walk" "a man walk down"
[101] "man walk down before" "walk down before you"
[103] "down before you call" "before you call him"
[105] "you call him a" "call him a man"
[107] "him a man how" "a man how many"
[109] "man how many seas" "how many seas must"
[111] "many seas must a" "seas must a white"
[113] "must a white dove" "a white dove sail"
[115] "white dove sail before" "dove sail before she"
[117] "sail before she sleeps" "before she sleeps in"
[119] "she sleeps in the" "sleeps in the sand"
[121] "in the sand how" "the sand how many"
[123] "sand how many times" "how many times must"
[125] "many times must the" "times must the cannonballs"
[127] "must the cannonballs fly" "the cannonballs fly before"
[129] "cannonballs fly before they're" "fly before they're forever"
[131] "before they're forever banned" "they're forever banned the"
[133] "forever banned the answer" "banned the answer my"
[135] "the answer my friend" "answer my friend is"
[137] "my friend is blowin" "friend is blowin in"
[139] "is blowin in the" "blowin in the wind"
[141] "in the wind the" "the wind the answer"
[143] "wind the answer is" "the answer is blowin"
[145] "answer is blowin in" "is blowin in the"
[147] "blowin in the wind"
stopwords
Arguments
- language ... `en`, `da`, `de`, `el`, `es`, `fr`, `it`, `ru`
> stopwords("en")
[1] "a" "an" "and" "are" "as" "at" "be" "but"
[9] "by" "for" "if" "in" "into" "is" "it" "no"
[17] "not" "of" "on" "or" "such" "that" "the" "their"
[25] "then" "there" "these" "they" "this" "to" "was" "will"
[33] "with"
> stopwords("de")
[1] "aber" "alle" "allem" "allen" "aller"
[6] "alles" "als" "also" "am" "an"
[11] "ander" "andere" "anderem" "anderen" "anderer"
[16] "anderes" "anderm" "andern" "anderr" "anders"
[21] "auch" "auf" "aus" "bei" "bin"
[26] "bis" "bist" "da" "damit" "dann"
[31] "der" "den" "des" "dem" "die"
[36] "das" "daß" "derselbe" "derselben" "denselben"
[41] "desselben" "demselben" "dieselbe" "dieselben" "dasselbe"
[46] "dazu" "dein" "deine" "deinem" "deinen"
[51] "deiner" "deines" "denn" "derer" "dessen"
[56] "dich" "dir" "du" "dies" "diese"
[61] "diesem" "diesen" "dieser" "dieses" "doch"
[66] "dort" "durch" "ein" "eine" "einem"
[71] "einen" "einer" "eines" "einig" "einige"
[76] "einigem" "einigen" "einiger" "einiges" "einmal"
[81] "er" "ihn" "ihm" "es" "etwas"
[86] "euer" "eure" "eurem" "euren" "eurer"
[91] "eures" "für" "gegen" "gewesen" "hab"
[96] "habe" "haben" "hat" "hatte" "hatten"
[101] "hier" "hin" "hinter" "ich" "mich"
[106] "mir" "ihr" "ihre" "ihrem" "ihren"
[111] "ihrer" "ihres" "euch" "im" "in"
[116] "indem" "ins" "ist" "jede" "jedem"
[121] "jeden" "jeder" "jedes" "jene" "jenem"
[126] "jenen" "jener" "jenes" "jetzt" "kann"
[131] "kein" "keine" "keinem" "keinen" "keiner"
[136] "keines" "können" "könnte" "machen" "man"
[141] "manche" "manchem" "manchen" "mancher" "manches"
[146] "mein" "meine" "meinem" "meinen" "meiner"
[151] "meines" "mit" "muss" "musste" "nach"
[156] "nicht" "nichts" "noch" "nun" "nur"
[161] "ob" "oder" "ohne" "sehr" "sein"
[166] "seine" "seinem" "seinen" "seiner" "seines"
[171] "selbst" "sich" "sie" "ihnen" "sind"
[176] "so" "solche" "solchem" "solchen" "solcher"
[181] "solches" "soll" "sollte" "sondern" "sonst"
[186] "über" "um" "und" "uns" "unse"
[191] "unsem" "unsen" "unser" "unses" "unter"
[196] "viel" "vom" "von" "vor" "während"
[201] "war" "waren" "warst" "was" "weg"
[206] "weil" "weiter" "welche" "welchem" "welchen"
[211] "welcher" "welches" "wenn" "werde" "werden"
[216] "wie" "wieder" "will" "wir" "wird"
[221] "wirst" "wo" "wollen" "wollte" "würde"
[226] "würden" "zu" "zum" "zur" "zwar"
[231] "zwischen"