Unicode: Unicode Data and Utilities
> library(Unicode)
バージョン: 8.0.0.1
関数名 | 概略 |
---|---|
Unicode_alphabetic_tokenizer | Unicode Alphabetic Tokenizer |
as.u_char | Unicode Character Objects |
n_of_u_chars | Unicode Character Counts |
u_blocks | Unicode Blocks |
u_char_inspect | Unicode Character Inspection |
u_char_match | Unicode Character Matching |
u_char_name | Unicode Character Names |
u_char_properties | Unicode Character Properties |
u_named_sequences | Unicode Named Sequences |
u_scripts | Unicode Scripts |
u_to_lower_case | Unicode Case Conversions |
u_named_sequences
> u_named_sequences() %>% head()
Name
1 LATIN CAPITAL LETTER A WITH MACRON AND GRAVE
2 LATIN SMALL LETTER A WITH MACRON AND GRAVE
3 LATIN CAPITAL LETTER E WITH VERTICAL LINE BELOW
4 LATIN SMALL LETTER E WITH VERTICAL LINE BELOW
5 LATIN CAPITAL LETTER E WITH VERTICAL LINE BELOW AND GRAVE
6 LATIN SMALL LETTER E WITH VERTICAL LINE BELOW AND GRAVE
Sequence
1 <U+0100,U+0300>
2 <U+0101,U+0300>
3 <U+0045,U+0329>
4 <U+0065,U+0329>
5 <U+00C8,U+0329>
6 <U+00E8,U+0329>
u_scripts
> scripts <- u_scripts()
> scripts %>% names()
[1] "Ahom" "Anatolian_Hieroglyphs"
[3] "Arabic" "Armenian"
[5] "Avestan" "Balinese"
[7] "Bamum" "Bassa_Vah"
[9] "Batak" "Bengali"
[11] "Bopomofo" "Brahmi"
[13] "Braille" "Buginese"
[15] "Buhid" "Canadian_Aboriginal"
[17] "Carian" "Caucasian_Albanian"
[19] "Chakma" "Cham"
[21] "Cherokee" "Common"
[23] "Coptic" "Cuneiform"
[25] "Cypriot" "Cyrillic"
[27] "Deseret" "Devanagari"
[29] "Duployan" "Egyptian_Hieroglyphs"
[31] "Elbasan" "Ethiopic"
[33] "Georgian" "Glagolitic"
[35] "Gothic" "Grantha"
[37] "Greek" "Gujarati"
[39] "Gurmukhi" "Han"
[41] "Hangul" "Hanunoo"
[43] "Hatran" "Hebrew"
[45] "Hiragana" "Imperial_Aramaic"
[47] "Inherited" "Inscriptional_Pahlavi"
[49] "Inscriptional_Parthian" "Javanese"
[51] "Kaithi" "Kannada"
[53] "Katakana" "Kayah_Li"
[55] "Kharoshthi" "Khmer"
[57] "Khojki" "Khudawadi"
[59] "Lao" "Latin"
[61] "Lepcha" "Limbu"
[63] "Linear_A" "Linear_B"
[65] "Lisu" "Lycian"
[67] "Lydian" "Mahajani"
[69] "Malayalam" "Mandaic"
[71] "Manichaean" "Meetei_Mayek"
[73] "Mende_Kikakui" "Meroitic_Cursive"
[75] "Meroitic_Hieroglyphs" "Miao"
[77] "Modi" "Mongolian"
[79] "Mro" "Multani"
[81] "Myanmar" "Nabataean"
[83] "New_Tai_Lue" "Nko"
[85] "Ogham" "Ol_Chiki"
[87] "Old_Hungarian" "Old_Italic"
[89] "Old_North_Arabian" "Old_Permic"
[91] "Old_Persian" "Old_South_Arabian"
[93] "Old_Turkic" "Oriya"
[95] "Osmanya" "Pahawh_Hmong"
[97] "Palmyrene" "Pau_Cin_Hau"
[99] "Phags_Pa" "Phoenician"
[101] "Psalter_Pahlavi" "Rejang"
[103] "Runic" "Samaritan"
[105] "Saurashtra" "Sharada"
[107] "Shavian" "Siddham"
[109] "SignWriting" "Sinhala"
[111] "Sora_Sompeng" "Sundanese"
[113] "Syloti_Nagri" "Syriac"
[115] "Tagalog" "Tagbanwa"
[117] "Tai_Le" "Tai_Tham"
[119] "Tai_Viet" "Takri"
[121] "Tamil" "Telugu"
[123] "Thaana" "Thai"
[125] "Tibetan" "Tifinagh"
[127] "Tirhuta" "Ugaritic"
[129] "Vai" "Warang_Citi"
[131] "Yi"
u_to_lower_case / u_to_upper_case / u_to_title_case / u_case_fold
> u_to_upper_case("heiß")
[1] "HEISS"
> u_case_fold("heiß")
[1] "heiss"