Kuromoji
A Ruby wrapper for the Kuromoji Japanese morphological analyzer.
Requires Java.
Installation
Add this line to your application's Gemfile:
gem 'kuromoji-ruby'
And then execute:
$ bundle
Or install it yourself as:
$ gem install kuromoji-ruby
Usage
You have to set the JAVA_HOME environment variable:
# .bashrc
# This is an OS X example.
export JAVA_HOME=`/usr/libexec/java_home`
Then run:
Kuromoji::Core.new.tokenize("あそこにいるのチャウチャウちゃうんちゃう")
#=> {"あそこ"=>"名詞,代名詞,一般,*,*,*,あそこ,アソコ,アソコ",
"に"=>"助詞,格助詞,一般,*,*,*,に,ニ,ニ",
"いる"=>"動詞,自立,*,*,一段,基本形,いる,イル,イル",
"の"=>"助詞,連体化,*,*,*,*,の,ノ,ノ",
"チャウチャウ"=>"名詞,一般,*,*,*,*,*",
"ちゃう"=>"動詞,自立,*,*,五段・ワ行促音便,基本形,ちゃう,チャウ,チャウ",
"ん"=>"名詞,非自立,一般,*,*,*,ん,ン,ン"}
Kuromoji::Core.new.reading("吉田篤")
#=> {"吉田"=>"ヨシダ", "篤"=>"アツシ"}
Command-line interface:
Commands:
kuromoji all_features # echo '関西国際空港は、大阪府大阪市の南西38kmにまたがる会社管理空港である' | kuromoji -a
kuromoji help [COMMAND] # Describe available commands or one specific command
kuromoji reading # echo '関西国際空港は、大阪府大阪市の南西38kmにまたがる会社管理空港である' | kuromoji -r
kuromoji tokenize # echo '関西国際空港は、大阪府大阪市の南西38kmにまたがる会社管理空港である' | kuromoji -t
Example
The output is in JSON format:
$ echo '関西国際空港は、大阪府大阪市の南西38kmにまたがる会社管理空港である' | kuromoji -t | jq .
#[
# {
# "is_user": false,
# "is_known": true,
# "position": 0,
# "surface_form": "関西国際空港",
# "base_form": "関西国際空港",
# "reading": "カンサイコクサイクウコウ",
# "parts_of_speech": "名詞,固有名詞,組織,*"
# },
# {
# "is_user": false,
# "is_known": true,
# "position": 6,
# "surface_form": "は",
# "base_form": "は",
# "reading": "ハ",
# "parts_of_speech": "助詞,係助詞,*,*"
# },
# {
# "is_user": false,
# "is_known": true,
# "position": 7,
# "surface_form": "、",
# "base_form": "、",
# "reading": "、",
# "parts_of_speech": "記号,読点,*,*"
# },
# {
# "is_user": false,
# "is_known": true,
# "position": 8,
# "surface_form": "大阪",
# "base_form": "大阪",
# "reading": "オオサカ",
# "parts_of_speech": "名詞,固有名詞,地域,一般"
# },
# {
# "is_user": false,
# "is_known": true,
# "position": 10,
# "surface_form": "府",
# "base_form": "府",
# "reading": "フ",
# "parts_of_speech": "名詞,接尾,地域,*"
# },
# {
# "is_user": false,
# "is_known": true,
# "position": 11,
# "surface_form": "大阪",
# "base_form": "大阪",
# "reading": "オオサカ",
# "parts_of_speech": "名詞,固有名詞,地域,一般"
# },
# {
# "is_user": false,
# "is_known": true,
# "position": 13,
# "surface_form": "市",
# "base_form": "市",
# "reading": "シ",
# "parts_of_speech": "名詞,接尾,地域,*"
# },
# {
# "is_user": false,
# "is_known": true,
# "position": 14,
# "surface_form": "の",
# "base_form": "の",
# "reading": "ノ",
# "parts_of_speech": "助詞,連体化,*,*"
# },
# {
# "is_user": false,
# "is_known": true,
# "position": 15,
# "surface_form": "南西",
# "base_form": "南西",
# "reading": "ナンセイ",
# "parts_of_speech": "名詞,一般,*,*"
# },
# {
# "is_user": false,
# "is_known": false,
# "position": 17,
# "surface_form": "38",
# "base_form": null,
# "reading": null,
# "parts_of_speech": "名詞,数,*,*"
# },
# {
# "is_user": false,
# "is_known": false,
# "position": 19,
# "surface_form": "km",
# "base_form": null,
# "reading": null,
# "parts_of_speech": "名詞,一般,*,*"
# },
# {
# "is_user": false,
# "is_known": true,
# "position": 21,
# "surface_form": "に",
# "base_form": "に",
# "reading": "ニ",
# "parts_of_speech": "助詞,格助詞,一般,*"
# },
# {
# "is_user": false,
# "is_known": true,
# "position": 22,
# "surface_form": "またがる",
# "base_form": "またがる",
# "reading": "マタガル",
# "parts_of_speech": "動詞,自立,*,*"
# },
# {
# "is_user": false,
# "is_known": true,
# "position": 26,
# "surface_form": "会社",
# "base_form": "会社",
# "reading": "カイシャ",
# "parts_of_speech": "名詞,一般,*,*"
# },
# {
# "is_user": false,
# "is_known": true,
# "position": 28,
# "surface_form": "管理",
# "base_form": "管理",
# "reading": "カンリ",
# "parts_of_speech": "名詞,サ変接続,*,*"
# },
# {
# "is_user": false,
# "is_known": true,
# "position": 30,
# "surface_form": "空港",
# "base_form": "空港",
# "reading": "クウコウ",
# "parts_of_speech": "名詞,一般,*,*"
# },
# {
# "is_user": false,
# "is_known": true,
# "position": 32,
# "surface_form": "で",
# "base_form": "だ",
# "reading": "デ",
# "parts_of_speech": "助動詞,*,*,*"
# },
# {
# "is_user": false,
# "is_known": true,
# "position": 33,
# "surface_form": "ある",
# "base_form": "ある",
# "reading": "アル",
# "parts_of_speech": "助動詞,*,*,*"
# },
# {
# "is_user": false,
# "is_known": false,
# "position": 35,
# "surface_form": "\n",
# "base_form": null,
# "reading": null,
# "parts_of_speech": "記号,空白,*,*"
# }
#]
Contributing
- Fork it ( http://github.com/[my-github-username]/kuromoji/fork )
- Create your feature branch (`git checkout -b my-new-feature`)
- Commit your changes (`git commit -am 'Add some feature'`)
- Push to the branch (`git push origin my-new-feature`)
- Create a new Pull Request