{"id":459,"date":"2020-10-16T14:19:58","date_gmt":"2020-10-16T05:19:58","guid":{"rendered":"http:\/\/cedartrees.co.kr\/?p=459"},"modified":"2021-04-03T19:11:57","modified_gmt":"2021-04-03T10:11:57","slug":"word2vec-test","status":"publish","type":"post","link":"http:\/\/blog.cedartrees.co.kr\/index.php\/2020\/10\/16\/word2vec-test\/","title":{"rendered":"Word2Vec \uc2dc\uac01\ud654"},"content":{"rendered":"\n<p>Word2Vec\uc740 \uac04\ub2e8\uc744 \uac04\ub2e8\ud788 \ub9d0\ud558\uba74 &#8220;\ubb38\uc7a5\uc548\uc5d0 \uc788\ub294 \uc5ec\ub7ec \ub2e8\uc5b4\ub4e4\uc744 \ubca1\ud130 \ud615\ud0dc\ub85c \ud45c\ud604\ud558\ub294 \uac83&#8221; \ub9d0 \uadf8\ub300\ub85c Word to Vector\ub77c\uace0 \ud560 \uc218 \uc788\uc2b5\ub2c8\ub2e4. \uc6cc\ub4dc\ub77c\ub294 \ub9d0\uc740 \uc27d\uac8c \uc774\ud574\ud560 \uc218 \uc788\uc9c0\ub9cc \ubca1\ud130(Vector)\ub294 \uc5b4\ub5a4 \ub73b\uc77c\uae4c\uc694?<\/p>\n\n\n\n<p>\ubb3c\ub9ac\ud559\uc774\ub098 \uc218\ud559\uc5d0\uc11c \uc57d\uac04\uc529 \ucc28\uc774\uac00 \uc788\uc9c0\ub9cc \uacf5\ud1b5\uc801\uc73c\ub85c \uc5b4\ub5a4 \uacf5\uac04\uc5d0\uc11c \uc704\uce58\uc640 \ubc29\ud5a5\uc131\uc744 \uac00\uc9c0\ub294 \uac12\uc744 \ud45c\ud604\ud558\ub294 \uac83\uc774\ub77c\uace0 \ud560 \uc218 \uc788\uc2b5\ub2c8\ub2e4. \uadf8\ub7ec\ub2c8\uae4c \ubb38\uc7a5\uc5d0 \ub9ce\uc740 Word\ub97c \uc5b4\ub5a4 \uacf5\uac04\uc5d0 \uc704\uce58\uac12\uc744 \ud45c\uc2dc\ud560 \ubfd0\ub9cc\uc544\ub2c8\ub77c \uc774 \uac12\ub4e4\uc774 \uc5b4\ub5a4 \ubc29\ud5a5\uc131\uc774 \uc788\ub294\uc9c0\ub97c \ud45c\uc2dc\ud558\ub294 \uae30\ubc95\uc774 Word2Vec\uc774\ub77c\uace0 \ud558\uaca0\uc2b5\ub2c8\ub2e4.<\/p>\n\n\n\n<figure class=\"wp-block-image size-large\"><img src=\"https:\/\/shuuki4.files.wordpress.com\/2016\/01\/vector.png\" alt=\"\"\/><\/figure>\n\n\n\n<p>\uc704\uc758 \uc774\ubbf8\uc9c0\ub294 word2vec\uc758 \uac00\uc7a5 \uc720\uba85\ud55c \uadf8\ub9bc \uc911\uc5d0 \ud558\ub098\uc785\ub2c8\ub2e4. 
\uac01 \ub2e8\uc5b4\ub4e4\uc744 \ubcf4\uba74 \uc5b4\ub5a4 \ubc29\ud5a5\uc131\uc774 \uc788\uace0 \uc22b\uc790 \uac12\uc744 \uac00\uc9c0\uace0 \uc788\uc2b5\ub2c8\ub2e4. \uadf8\ub807\uae30 \ub54c\ubb38\uc5d0 \uc720\uc0ac\ub3c4\ub97c \uacc4\uc0b0 \ud560 \uc218\ub3c4 \uc788\uace0 \uac01 \ub2e8\uc5b4\uc758 \uad00\uacc4\uc5d0 \ub300\ud55c \uc5f0\uc0b0\uc774 \uac00\ub2a5\ud569\ub2c8\ub2e4.<br>\uc608\ub97c \ub4e4\uc5b4\uc11c &#8220;KING-MAN+WOMAN=QUEEN&#8221;\uc774\ub77c\ub294 \uad00\uacc4\uac00 \ub098\uc628\ub2e4\ub294 \uac83\uc774\uc8e0.<br>\ub610 &#8220;\ud55c\uad6d-\uc11c\uc6b8+\ub3c4\ucfc4=\uc77c\ubcf8&#8221;\ub77c\ub294 \uad00\uacc4\ub97c \ucd94\ucd9c\ud560 \uc218 \uc788\uc2b5\ub2c8\ub2e4. \uc544\ub798 \ub9c1\ud06c\ub97c \ubc29\ubb38\ud574\ubcf4\uc2dc\uace0 \ub2e4\uc591\ud55c \ucf00\uc774\uc2a4\ub97c \ud14c\uc2a4\ud2b8\ud574\ubcf4\uc2dc\uae30 \ubc14\ub78d\ub2c8\ub2e4.<br><a href=\"https:\/\/word2vec.kr\/search\/\">https:\/\/word2vec.kr\/search\/<\/a><\/p>\n\n\n\n<p>\uc774\uac83\uc740 \uc804\ud1b5\uc801\uc778 \ubc29\ubc95\uc778 One-Hot-Encoding\uc744 \ud1b5\ud574\uc11c \ub2e8\uc5b4\ub97c \ud45c\ud604\ud558\ub294 \uac83\uc758 \ubb38\uc81c\uc810\uc744 \uadf9\ubcf5\ud560 \uc218 \uc788\ub294 \uc544\uc8fc \uc720\uc6a9\ud55c \ubc29\ubc95\uc785\ub2c8\ub2e4. 
\uc774\ub807\uac8c \ub2e8\uc5b4\ub4e4\uc744 \ubca1\ud130\ub85c \ubc14\uafb8\ub294 \uac83\uc744 \uc6cc\ub4dc \uc784\ubca0\ub529(Word-Embedding)\uc774\ub77c\uace0 \ud558\uace0 \uadf8\uc911\uc5d0\uc11c \uac00\uc7a5 \ub300\ud45c\uc801\uc778 \ubaa8\ub378\uc774 Word2Vec \ubaa8\ub378\ub85c \ud574\ub2f9 \ub2e8\uc5b4\uc640 \ud568\uaed8 \uc790\uc8fc \ub4f1\uc7a5\ud558\ub294 \ub2e8\uc5b4\ub294 \ube44\uc2b7\ud55c \ub2e8\uc5b4\uc77c\uac83\uc774\ub77c\ub294 \uac00\uc815\uc73c\ub85c \ucd9c\ubc1c\ud569\ub2c8\ub2e4.<\/p>\n\n\n\n<p>\ubcf8 \uc608\uc81c\ub294 Word2Vec\uc758 \uc6d0\ub9ac\uc640 \uc774\ub860\uc744 \uc18c\uac1c\ud558\ub294 \uac83\uc740 \uc544\ub2c8\uace0 \uc2e4\uc81c\ub85c \ub2e8\uc5b4\ub97c 2\ucc28\uc6d0 \uacf5\uac04\uc5d0 \ud45c\uc2dc\ud558\ub294 \ubc29\ubc95\uc5d0 \ub300\ud55c \uc608\uc81c\ucf54\ub4dc\uc774\uae30 \ub54c\ubb38\uc5d0 \ud574\ub2f9 \uc774\ub860\uc774 \uad81\uae08\ud558\uc2e0 \ubd84\ub4e4\uc740 \uc778\ud130\ub137\uc5d0 \uacf5\uac1c\ub41c \ub9ce\uc740 \uc608\uc81c\ub4e4\uc774 \uc788\uc73c\ub2c8 \ucc38\uace0\ud574\ubcf4\uc2dc\uae30 \ubc14\ub78d\ub2c8\ub2e4.<\/p>\n\n\n\n<p>\uc608\uc81c\ub97c \uc2e4\ud589\ud558\uae30 \uc704\ud574\uc11c \uba3c\uc800 \ud544\uc694\ud55c \ub77c\uc774\ube0c\ub7ec\ub9ac\ub97c import\ud569\ub2c8\ub2e4. <br>\ubd84\uc11d\ud560 \ub370\uc774\ud130\ub294 \uc778\ud130\ub137 \uc1fc\ud551\ubab0\uc758 \ub9c8\uc6b0\uc2a4\ub97c \uad6c\ub9e4\ud55c \ud6c4\uc5d0 \ub0a8\uae34 \ud6c4\uae30\ub4e4\uc744 \ubaa8\uc740 \uac83\uc785\ub2c8\ub2e4. 
\uc608\ub97c \ub4e4\uc5b4 \uc81c\ud488\uc758 \uc774\ub984\uc744 \uc120\ud0dd\ud588\uc744 \uacbd\uc6b0\uc5d0 \ud574\ub2f9 \ub2e8\uc5b4\uc640 \uac00\uc7a5 \uac70\ub9ac\uac00 \uac00\uae4c\uc6b4 \ub2e8\uc5b4\ub4e4\uc774 \uae0d\uc815\uc758 \ub2e8\uc5b4\ub4e4\uc774\ub77c\uba74 \uc81c\ud488\uc758 \ud3c9\uac00\uac00 \uc88b\uc744 \uac83\uc77c\ud14c\uace0 \ubc18\ub300\ub85c \uc81c\ud488\uc774 \ubd80\uc815\uc801\uc778 \ub2e8\uc5b4\ub4e4\uacfc \uac70\ub9ac\uac00 \uac00\uae5d\ub2e4\uba74 \ubc18\ub300\uc758 \uacbd\uc6b0\ub77c\uace0 \uc0dd\uac01\ud560 \uc218 \uc788\uaca0\uc2b5\ub2c8\ub2e4. <\/p>\n\n\n\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"generic\" data-enlighter-theme=\"\" data-enlighter-highlight=\"\" data-enlighter-linenumbers=\"\" data-enlighter-lineoffset=\"\" data-enlighter-title=\"\" data-enlighter-group=\"\">import pandas as pd\nimport numpy as np\n\ndf_r = pd.read_excel(\".\/mouse_review.xlsx\")\ndf_r.head()<\/pre>\n\n\n\n<p>\ud30c\uc77c\uc744 \uc77d\uc5b4\uc628 \ub4a4\uc5d0 pandas\uc758 head() \ud568\uc218\ub85c \uc0c1\uc704 5\uac1c\uc758 \ub370\uc774\ud130\ub97c \ucd94\ucd9c\ud574\ubd05\ub2c8\ub2e4.<br>\ub370\uc774\ud130\ub294 \uc0ac\uc6a9\uc790, \uc791\uc131\uc77c, \ub9ac\ubdf0 \ub0b4\uc6a9, \ubcc4\uc810, \uc81c\ud488\uba85 \uc815\ubcf4\uac00 \uc788\uc2b5\ub2c8\ub2e4.<\/p>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" width=\"899\" height=\"168\" src=\"http:\/\/cedartrees.co.kr\/wp-content\/uploads\/2020\/10\/\u1109\u1173\u110f\u1173\u1105\u1175\u11ab\u1109\u1163\u11ba-2020-10-16-\u110b\u1169\u1112\u116e-2.16.19.png\" alt=\"\" class=\"wp-image-464\" srcset=\"http:\/\/blog.cedartrees.co.kr\/wp-content\/uploads\/2020\/10\/\u1109\u1173\u110f\u1173\u1105\u1175\u11ab\u1109\u1163\u11ba-2020-10-16-\u110b\u1169\u1112\u116e-2.16.19.png 899w, http:\/\/blog.cedartrees.co.kr\/wp-content\/uploads\/2020\/10\/\u1109\u1173\u110f\u1173\u1105\u1175\u11ab\u1109\u1163\u11ba-2020-10-16-\u110b\u1169\u1112\u116e-2.16.19-300x56.png 300w, 
http:\/\/blog.cedartrees.co.kr\/wp-content\/uploads\/2020\/10\/\u1109\u1173\u110f\u1173\u1105\u1175\u11ab\u1109\u1163\u11ba-2020-10-16-\u110b\u1169\u1112\u116e-2.16.19-768x144.png 768w\" sizes=\"(max-width: 706px) 89vw, (max-width: 767px) 82vw, 740px\" \/><\/figure>\n\n\n\n<p>\uc774\ubc88\uc5d0 \uc0ac\uc6a9\ud560 text \uc815\ubcf4\ub294 \ub9ac\ubdf0 \ub0b4\uc6a9\uc785\ub2c8\ub2e4. \ub9ac\ubdf0\uc5d0 \ubcf4\uba74 \uc5ec\ub7ec\uac00\uc9c0 \ud2b9\uc218\uae30\ud638, \uc601\ubb38\uc790 \ub4f1\uc774 \uc788\uae30 \ub54c\ubb38\uc5d0 \uc815\uaddc\uc2dd\uc744 \ud1b5\ud574\uc11c \ud55c\uae00 \uc678\uc5d0 \ub098\uba38\uc9c0 \ub370\uc774\ud130\ub97c \uac78\ub7ec\ub0c5\ub2c8\ub2e4. \uac78\ub7ec\ub0b8 \ub370\uc774\ud130\ub294 review_train \uceec\ub7fc\uc744 \ub9cc\ub4e4\uc5b4\uc11c \uc6d0\ubcf8 \ub370\uc774\ud130\uc640 \ubcc4\ub3c4\ub85c \uc800\uc7a5\ud574\ub461\ub2c8\ub2e4.<\/p>\n\n\n\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"generic\" data-enlighter-theme=\"\" data-enlighter-highlight=\"\" data-enlighter-linenumbers=\"\" data-enlighter-lineoffset=\"\" data-enlighter-title=\"\" data-enlighter-group=\"\">df_r['review_train'] = df_r['review'].str.replace(\"[^\u3131-\u314e\u314f-\u3163\uac00-\ud7a3 ]\",\"\")\ndf_r.head()<\/pre>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" width=\"913\" height=\"245\" src=\"http:\/\/cedartrees.co.kr\/wp-content\/uploads\/2020\/10\/\u1109\u1173\u110f\u1173\u1105\u1175\u11ab\u1109\u1163\u11ba-2020-10-16-\u110b\u1169\u1112\u116e-2.16.28.png\" alt=\"\" class=\"wp-image-465\" srcset=\"http:\/\/blog.cedartrees.co.kr\/wp-content\/uploads\/2020\/10\/\u1109\u1173\u110f\u1173\u1105\u1175\u11ab\u1109\u1163\u11ba-2020-10-16-\u110b\u1169\u1112\u116e-2.16.28.png 913w, http:\/\/blog.cedartrees.co.kr\/wp-content\/uploads\/2020\/10\/\u1109\u1173\u110f\u1173\u1105\u1175\u11ab\u1109\u1163\u11ba-2020-10-16-\u110b\u1169\u1112\u116e-2.16.28-300x81.png 300w, 
http:\/\/blog.cedartrees.co.kr\/wp-content\/uploads\/2020\/10\/\u1109\u1173\u110f\u1173\u1105\u1175\u11ab\u1109\u1163\u11ba-2020-10-16-\u110b\u1169\u1112\u116e-2.16.28-768x206.png 768w\" sizes=\"(max-width: 706px) 89vw, (max-width: 767px) 82vw, 740px\" \/><\/figure>\n\n\n\n<p>1\ucc28\ub85c \uc815\uaddc\ud654\ub97c \ub05d\ub0b8 \ud14d\uc2a4\ud2b8 \ub370\uc774\ud130\ub97c \ud1b5\ud574\uc11c \ubb38\uc7a5\uc744 \ud615\ud0dc\uc18c\ubcc4\ub85c \ubd84\ub9ac\ud574\uc90d\ub2c8\ub2e4. \ub610 \uc0ac\uc6a9\ud558\uc9c0 \uc54a\ub294 \ub2e8\uc5b4\ub4e4\uc758 \uc0ac\uc804\uc744 \ubaa8\uc544\uc11c \ubd88\uc6a9\ub2e8\uc5b4\ub97c \uac78\ub7ec\ub0c5\ub2c8\ub2e4. \ubd84\uc11d\ud558\uace0\uc790 \ud558\ub294 \uc0c1\ud669\uc5d0 \ub9de\ucdb0\uc11c \ubd88\uc6a9\uc5b4\ub97c \ub4f1\ub85d\ud574\uc90d\ub2c8\ub2e4.<\/p>\n\n\n\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"generic\" data-enlighter-theme=\"\" data-enlighter-highlight=\"\" data-enlighter-linenumbers=\"\" data-enlighter-lineoffset=\"\" data-enlighter-title=\"\" data-enlighter-group=\"\">from konlpy.tag import Okt\n\nstop_words = ['\uac00','\uc694','\ubcc0','\uc744','\uc218','\uc5d0','\ubb38','\uc81c','\ub97c','\uc774','\ub3c4','\uc740','\ub2e4','\uac8c','\uc694','\ud55c','\uc77c','\ud560','\uc778\ub370','\uac70','\uc880','\ub294\ub370','\u314e\u314e','\ubb50','\uae4c','\uc788\ub294','\uc798','\uc2b5\ub2c8\ub2e4','\ub2e4\uba74','\ud588','\uc8fc\ub824','\uc9c0','\uc788','\ubabb','\ud6c4','\uc911','\uc904']\n\nokt = Okt()\ntokenized_data = []\nfor sentence in df_r['review_train']:\n    temp_X = okt.morphs(sentence, stem=True) # \ud1a0\ud070\ud654\n    temp_X = [word for word in temp_X if not word in stop_words] \n    tokenized_data.append(temp_X)<\/pre>\n\n\n\n<p>\uc774\uc81c \uc2dc\uac01\ud654\ub97c \uc704\ud55c \uc900\ube44\ub97c \ud574\uc90d\ub2c8\ub2e4. 
\uc2dc\uac01\ud654\ub294 matplotlib\uc744 \uc0ac\uc6a9\ud569\ub2c8\ub2e4.<br>\ud55c\uae00\ud654\ub97c \uc704\ud574\uc11c \ud3f0\ud2b8\ub97c \uc124\uc815\ud574\uc90d\ub2c8\ub2e4.<br>\ubcf8 \uc608\uc81c\ub294 Mac OS\ud658\uacbd\uc5d0\uc11c \ud14c\uc2a4\ud2b8 \ub418\uc5c8\uae30 \ub54c\ubb38\uc5d0 \ud3f0\ud2b8\uc758 \uc704\uce58\ub294 Windows \uc0ac\uc6a9\uc790\uc640 \ub2e4\ub97c \uc218 \uc788\uc73c\ub2c8 \ud14c\uc2a4\ud2b8 \ud658\uacbd\uc5d0 \ub9de\uac8c \ud3f0\ud2b8 \uc815\ubcf4\ub97c \ubcc0\uacbd\ud574\uc90d\ub2c8\ub2e4.<\/p>\n\n\n\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"generic\" data-enlighter-theme=\"\" data-enlighter-highlight=\"\" data-enlighter-linenumbers=\"\" data-enlighter-lineoffset=\"\" data-enlighter-title=\"\" data-enlighter-group=\"\">import matplotlib.pyplot as plt\nfrom matplotlib import font_manager, rc\nfont_name = font_manager.FontProperties(fname='\/System\/Library\/Fonts\/Supplemental\/AppleGothic.ttf').get_name()\nrc('font', family=font_name)<\/pre>\n\n\n\n<p>\ub9ac\ubdf0 \ud14d\uc2a4\ud2b8\uc758 \uc815\ubcf4\ub4e4\uc744 \uac04\ub2e8\ud788 \ud45c\uc2dc\ud574\uc90d\ub2c8\ub2e4. <br>\ud559\uc2b5\uc5d0 \ud544\uc694\ud55c \ub2e8\uacc4\ub294 \uc544\ub2c8\ub2c8 \ub370\uc774\ud130\uc5d0 \ub300\ud55c \uc815\ubcf4\ub97c \ubcf4\uace0\uc790 \ud558\uc9c0 \uc54a\ub294\ub2e4\uba74 \uadf8\ub0e5 \ub118\uc5b4\uac00\uc154\ub3c4 \ub418\uaca0\uc2b5\ub2c8\ub2e4. 
\ubcf8 \uc608\uc81c\uc5d0 \uc0ac\uc6a9\ub41c \ub370\uc774\ud130\ub294 \ub300\ubd80\ubd84 \uae38\uc774\uac00 0~50\uae00\uc790 \uc0ac\uc774\uc758 \ube44\uad50\uc801 \uc9e7\uc740 \ubb38\uc7a5\ub4e4\uc774\ub77c\ub294 \uac83\uc744 \uc54c \uc218 \uc788\uc2b5\ub2c8\ub2e4.<\/p>\n\n\n\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"generic\" data-enlighter-theme=\"\" data-enlighter-highlight=\"\" data-enlighter-linenumbers=\"\" data-enlighter-lineoffset=\"\" data-enlighter-title=\"\" data-enlighter-group=\"\">print('\ub9ac\ubdf0\uc758 \ucd5c\ub300 \uae38\uc774 :',max(len(l) for l in tokenized_data))\nprint('\ub9ac\ubdf0\uc758 \ud3c9\uade0 \uae38\uc774 :',sum(map(len, tokenized_data))\/len(tokenized_data))\nplt.hist([len(s) for s in tokenized_data], bins=50)\nplt.xlabel('length of samples')\nplt.ylabel('number of samples')\nplt.show()<\/pre>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" width=\"395\" height=\"268\" src=\"http:\/\/cedartrees.co.kr\/wp-content\/uploads\/2020\/10\/\u1109\u1173\u110f\u1173\u1105\u1175\u11ab\u1109\u1163\u11ba-2020-10-16-\u110b\u1169\u1112\u116e-2.18.05.png\" alt=\"\" class=\"wp-image-466\" srcset=\"http:\/\/blog.cedartrees.co.kr\/wp-content\/uploads\/2020\/10\/\u1109\u1173\u110f\u1173\u1105\u1175\u11ab\u1109\u1163\u11ba-2020-10-16-\u110b\u1169\u1112\u116e-2.18.05.png 395w, http:\/\/blog.cedartrees.co.kr\/wp-content\/uploads\/2020\/10\/\u1109\u1173\u110f\u1173\u1105\u1175\u11ab\u1109\u1163\u11ba-2020-10-16-\u110b\u1169\u1112\u116e-2.18.05-300x204.png 300w\" sizes=\"(max-width: 395px) 100vw, 395px\" \/><\/figure>\n\n\n\n<p>\uc774\uc81c Word2Vec \ubaa8\ub378\uc744 \uc0dd\uc131\ud560 \ucc28\ub840\uc785\ub2c8\ub2e4.<br>Word2Vec \ubaa8\ub378\uc740 \uac00\uc7a5 \uc798 \uc54c\ub824\uc9c4 gensim \ub77c\uc774\ube0c\ub7ec\ub9ac\ub97c \ud65c\uc6a9\ud574\ubcf4\uaca0\uc2b5\ub2c8\ub2e4.<\/p>\n\n\n\n<p>\uc0ac\uc6a9\ud55c \ud30c\ub77c\uba54\ud130\uc758 \uc790\uc138\ud55c \uc815\ubcf4\ub294 \uc544\ub798 \ub9c1\ud06c\ub97c 
\ucc38\uc870\ud574\ubcf4\uc2dc\uae30 \ubc14\ub78d\ub2c8\ub2e4.<br>\ubcf8 \ubaa8\ub378\uc740 \uc88c\uc6b0 5\uac1c\uc758 \ub2e8\uc5b4\ub97c \ucc38\uc870\ud558\ub294 100\ucc28\uc6d0\uc758 \uc6cc\ub4dc \ubca1\ud130\ub97c \ub9cc\ub4dc\ub294 \ubaa8\ub378\ub85c CBOW \uc54c\uace0\ub9ac\uc998\uc744 \uc0ac\uc6a9\ud558\uace0 5\ubc88 \ubbf8\ub9cc\uc73c\ub85c \ub4f1\uc7a5\ud558\ub294 \ub2e8\uc5b4\ub4e4\uc740 \uc81c\uc678\ud558\uaca0\uc2b5\ub2c8\ub2e4. workers\ub294 thread\uc758 \uac2f\uc218\ub85c \ud14c\uc2a4\ud2b8\ud558\ub294 \ud558\ub4dc\uc6e8\uc5b4\uc758 \uc131\ub2a5\uc5d0 \ub530\ub77c\uc11c \uc870\uc815\ud560 \uc218 \uc788\uc2b5\ub2c8\ub2e4.<br><a href=\"https:\/\/radimrehurek.com\/gensim\/models\/word2vec.html\">https:\/\/radimrehurek.com\/gensim\/models\/word2vec.html<\/a><\/p>\n\n\n\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"generic\" data-enlighter-theme=\"\" data-enlighter-highlight=\"\" data-enlighter-linenumbers=\"\" data-enlighter-lineoffset=\"\" data-enlighter-title=\"\" data-enlighter-group=\"\">from gensim.models import Word2Vec\nmodel = Word2Vec(sentences = tokenized_data, size = 100, window = 5, min_count = 5, workers = 4, sg = 0)<\/pre>\n\n\n\n<p>\ud559\uc2b5 \ub370\uc774\ud130\uac00 \ub9ce\uc9c0 \uc54a\uae30 \ub54c\ubb38\uc5d0 \ud559\uc2b5 \uc2dc\uac04\uc740 \uc624\ub798 \uac78\ub9ac\uc9c0 \uc54a\uc2b5\ub2c8\ub2e4.<br>\ud559\uc2b5\uc774 \ub05d\ub09c \ud6c4\uc5d0 \ub2e8\uc5b4\ub4e4\uc744 \ucd94\ucd9c\ud574\uc11c \ubca1\ud130 \ub9ac\uc2a4\ud2b8\ub97c \uc0dd\uc131\ud569\ub2c8\ub2e4.  
\ud574\ub2f9 \ub9ac\uc2a4\ud2b8 \ud558\ub098\ub97c \ucd9c\ub825\ud574\ubcf4\uba74 \uc544\ub798\uc640 \uac19\uc740 \ub370\uc774\ud130\uac00 \ud45c\uc2dc\ub429\ub2c8\ub2e4.<\/p>\n\n\n\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"generic\" data-enlighter-theme=\"\" data-enlighter-highlight=\"\" data-enlighter-linenumbers=\"\" data-enlighter-lineoffset=\"\" data-enlighter-title=\"\" data-enlighter-group=\"\">vocabs = model.wv.vocab.keys()\nword_vocab_list = [model.wv[v] for v in vocabs]<\/pre>\n\n\n\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"generic\" data-enlighter-theme=\"\" data-enlighter-highlight=\"\" data-enlighter-linenumbers=\"\" data-enlighter-lineoffset=\"\" data-enlighter-title=\"\" data-enlighter-group=\"\">array([ 0.45729467, -0.45482287,  0.2776271 , -0.38435346,  0.4311736 ,\n       -0.36617622,  0.12129851, -0.309033  , -0.09569103, -0.27311006,\n        0.28018764, -0.13276236,  0.13590969,  0.0521839 , -0.01882668,\n        0.13234554, -0.02577238,  0.43111804, -0.6007069 ,  0.52846146,\n        0.01065135, -0.20410554,  0.08504212, -0.5189065 ,  0.06219423,\n       -0.10900757,  0.19578645, -0.01295294, -0.20757432, -0.17270625,\n        0.08728364,  0.4751571 , -0.06208701, -0.3829262 ,  0.4810491 ,\n       -0.27205822, -0.16547562, -0.2804698 ,  0.1357591 ,  0.16740464,\n        0.53618526, -0.17420012,  0.06363445,  0.655636  ,  0.05952126,\n       -0.6312642 ,  0.11448789, -0.00824977, -0.26018238, -0.33553734,\n        0.18489622,  0.03913857, -0.5856825 , -0.08111028,  0.6696569 ,\n        0.4201213 , -0.2061224 , -0.03785964, -0.0813726 ,  0.0297378 ,\n       -0.5556496 , -0.0006753 ,  0.25876167,  0.08983239, -0.10351149,\n        0.24005203,  0.21328437,  0.0797505 , -0.23059952, -0.32846287,\n       -0.0017608 ,  0.51077896,  0.36693272,  0.2767188 , -0.47870687,\n       -0.3036568 , -0.06708886, -0.4789917 , -0.08152916,  0.19817959,\n        0.07031752, -0.34857494,  0.5963662 ,  0.02050934,  0.29983994,\n        
0.07854129,  0.40096822,  0.00098353, -0.26964054, -0.12954848,\n        0.33181033, -0.07866482,  0.40206903, -0.37808138, -0.10669091,\n       -0.15223539, -0.01180514, -0.13499472,  0.31345636,  0.08265099],\n      dtype=float32)<\/pre>\n\n\n\n<p>Word2Vec\uc5d0\uc11c \uc81c\uacf5\ud558\ub294 \ud568\uc218\uc778 most_similar()\ub97c \ud1b5\ud574\uc11c \uc785\ub825\ud558\ub294 \ub2e8\uc5b4\uc640 \uac00\uc7a5 \uac00\uae4c\uc6b4 \ub2e8\uc5b4 \uc815\ubcf4\ub97c \ud45c\uc2dc\ud574\ubd05\ub2c8\ub2e4. &#8220;\ud074\ub9ad&#8221;\uacfc \uac00\uc7a5 \uac00\uae4c\uc6b4 \ub2e8\uc5b4\ub294 &#8220;\ubc84\ud2bc&#8221;,&#8221;\uc18c\ub9ac&#8221; \ub4f1\uc758 \uc21c\uc11c\ub85c \uac01 \ub2e8\uc5b4\uac04\uc758 \uc5f0\uad00\uc131\uc774 \ub9e4\uc6b0 \ub192\ub2e4\ub294 \uac83\uc744 \uc54c \uc218 \uc788\uc2b5\ub2c8\ub2e4. <br><br>\uc544\ub9c8\ub3c4 \ub9c8\uc6b0\uc2a4\ub77c\ub294 \uc81c\ud488\uc758 \ud2b9\uc9d5\uc0c1 \ud074\ub9ad\uc774\ub77c\ub294 \ub2e8\uc5b4\uc640 \ud568\uaed8 \ubc84\ud2bc, \uc18c\ub9ac, \uac8c\uc784, \ub290\ub08c \ub4f1\uc758 \ub2e8\uc5b4\uac00 \ub9ce\uc774 \ub4f1\uc7a5\ud588\ub2e4\ub294 \uac83\uc744 \uc54c \uc218 \uc788\uc2b5\ub2c8\ub2e4.<\/p>\n\n\n\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"generic\" data-enlighter-theme=\"\" data-enlighter-highlight=\"\" data-enlighter-linenumbers=\"\" data-enlighter-lineoffset=\"\" data-enlighter-title=\"\" data-enlighter-group=\"\">print(model.wv.most_similar(\"\ud074\ub9ad\"))<\/pre>\n\n\n\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"generic\" data-enlighter-theme=\"\" data-enlighter-highlight=\"\" data-enlighter-linenumbers=\"\" data-enlighter-lineoffset=\"\" data-enlighter-title=\"\" data-enlighter-group=\"\">[('\ubc84\ud2bc', 0.9999016523361206), ('\uc18c\ub9ac', 0.9998948574066162), ('\uac8c\uc784', 0.9998838305473328), ('\ub290\ub08c', 0.9998810291290283), ('\uc544\ub2c8\ub2e4', 0.9998774528503418), ('\ub418\ub2e4', 0.9998745918273926), ('\uc774\ub098', 0.9998740553855896), ('\ub9cc', 
0.9998738765716553), ('\uc7ac\uc9c8', 0.9998738169670105), ('\ub204\ub974\ub2e4', 0.9998728036880493)]<\/pre>\n\n\n\n<p>\uc774\uc81c \uac01 \ub2e8\uc5b4\uc640\uc758 \uad00\uacc4\ub97c \uadf8\ub798\ud504\ub85c \ub098\ud0c0\ub0b4\ubcf4\uaca0\uc2b5\ub2c8\ub2e4. \ud574\ub2f9 \ub370\uc774\ud130\ub294 100\ucc28\uc6d0\uc758 \ub370\uc774\ud130\uc774\uace0 \uadf8\ub824\ubcf4\uace0\uc790 \ud558\ub294 \uac83\uc740 2\ucc28\uc6d0\uc5d0 \ud45c\uc2dc\ub418\ub294 \uadf8\ub798\ud504\uc774\uae30 \ub54c\ubb38\uc5d0 \ucc28\uc6d0\uc744 \ucd95\uc18c\ud560 \ud544\uc694\uac00 \uc788\uc2b5\ub2c8\ub2e4.<\/p>\n\n\n\n<p>\uc798\uc54c\ub824\uc9c4 \ucc28\uc6d0\ucd95\uc18c \uc54c\uace0\ub9ac\uc998\uc73c\ub85c PCA\uae30\ubc95\uc774 \uc788\uc2b5\ub2c8\ub2e4.<\/p>\n\n\n\n<p>PCA(Principal Component Analysis)\ub294 \ucc28\uc6d0\ucd95\uc18c(dimensionality reduction)\uc640\u00a0\ubcc0\uc218\ucd94\ucd9c(feature extraction)\u00a0\uae30\ubc95\uc73c\ub85c \ub110\ub9ac \uc4f0\uc774\uace0 \uc788\ub294 \uae30\ubc95\uc73c\ub85c \ub370\uc774\ud130\uc758\u00a0\ubd84\uc0b0(variance)\uc744 \ucd5c\ub300\ud55c \ubcf4\uc874\ud558\uba74\uc11c \uc11c\ub85c \uc9c1\uad50\ud558\ub294 \uc0c8 \uae30\uc800(\ucd95)\ub97c \ucc3e\uc544, \uace0\ucc28\uc6d0 \uacf5\uac04\uc758 \ud45c\ubcf8\ub4e4\uc744 \uc120\ud615 \uc5f0\uad00\uc131\uc774 \uc5c6\ub294 \uc800\ucc28\uc6d0 \uacf5\uac04\uc73c\ub85c \ubcc0\ud658\ud558\ub294 \uae30\ubc95\uc785\ub2c8\ub2e4.\u00a0<br><a href=\"https:\/\/ratsgo.github.io\/machine%20learning\/2017\/04\/24\/PCA\/\">https:\/\/ratsgo.github.io\/machine%20learning\/2017\/04\/24\/PCA\/<\/a><\/p>\n\n\n\n<div class=\"wp-block-image\"><figure class=\"aligncenter size-large\"><img src=\"http:\/\/i.imgur.com\/Uv2dlsH.gif\" alt=\"\"\/><\/figure><\/div>\n\n\n\n<pre class=\"EnlighterJSRAW\" data-enlighter-language=\"generic\" data-enlighter-theme=\"\" data-enlighter-highlight=\"\" data-enlighter-linenumbers=\"\" data-enlighter-lineoffset=\"\" data-enlighter-title=\"\" data-enlighter-group=\"\">from sklearn.decomposition 
import PCA\npca = PCA(n_components=2)\nxys = pca.fit_transform(word_vocab_list)\nxs = xys[:,0]\nys = xys[:,1]\n\n#plt.figure(figsize=(10 ,10))\nplt.scatter(xs, ys, marker = 'o')\nplt.xlim(0,1), plt.ylim(0,0.01)\nfor i, v in enumerate(vocabs):\n    plt.annotate(v, xy=(xs[i], ys[i]))<\/pre>\n\n\n\n<p>\ud574\ub2f9 \uae30\ubc95\uc744 \ud1b5\ud574\uc11c \uc544\ub798\uc640 \uac19\uc740 \uadf8\ub798\ud504\ub97c \uadf8\ub838\uc2b5\ub2c8\ub2e4. \ud574\ub2f9 \uadf8\ub798\ud504\ub294 \uc804\uccb4 \uadf8\ub798\ud504\uc5d0\uc11c \uc77c\ubd80 \uad6c\uac04(xlim, ylim)\uc744 \ud45c\uc2dc\ud55c \uac83\uc73c\ub85c \uc804\uccb4 \ub370\uc774\ud130\ub294 \uc544\ub2d9\ub2c8\ub2e4.<\/p>\n\n\n\n<p>\ub192\uc740 \ucc28\uc6d0\uc758 \ub370\uc774\ud130\ub97c \ud3c9\uba74\uc73c\ub85c \ucd95\uc18c\ud558\uba74\uc11c \ub370\uc774\ud130\uc758 \uad6c\uac04\uc774 \ub9ce\uc774 \uacb9\uce58\ub294 \uac83\uc744 \uc54c \uc218 \uc788\uc2b5\ub2c8\ub2e4. \uc774\ub7ec\ud55c \ubb38\uc81c\ub294 \ub370\uc774\ud130\ub97c \ub354 \ub192\uc740 \ucc28\uc6d0\uc758 \uacf5\uac04(3\ucc28\uc6d0)\uc5d0 \ud45c\uc2dc\ud55c\ub2e4\ub358\uac00 \uc544\ub2c8\uba74 \uc758\ubbf8 \uc5c6\ub294 \ub370\uc774\ud130\ub4e4\uc744 \ucd94\ucd9c\ud574\uc11c \ub370\uc774\ud130\uc758 \uc218\ub97c \uc904\uc5ec\uc11c \ud45c\uc2dc\ud560 \uc218\ub3c4 \uc788\uc2b5\ub2c8\ub2e4.<\/p>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" width=\"599\" height=\"393\" src=\"http:\/\/cedartrees.co.kr\/wp-content\/uploads\/2020\/10\/\u1109\u1173\u110f\u1173\u1105\u1175\u11ab\u1109\u1163\u11ba-2020-10-16-\u110b\u1169\u1112\u116e-2.19.21.png\" alt=\"\" class=\"wp-image-467\" srcset=\"http:\/\/blog.cedartrees.co.kr\/wp-content\/uploads\/2020\/10\/\u1109\u1173\u110f\u1173\u1105\u1175\u11ab\u1109\u1163\u11ba-2020-10-16-\u110b\u1169\u1112\u116e-2.19.21.png 599w, http:\/\/blog.cedartrees.co.kr\/wp-content\/uploads\/2020\/10\/\u1109\u1173\u110f\u1173\u1105\u1175\u11ab\u1109\u1163\u11ba-2020-10-16-\u110b\u1169\u1112\u116e-2.19.21-300x197.png 
300w\" sizes=\"(max-width: 599px) 100vw, 599px\" \/><\/figure>\n","protected":false},"excerpt":{"rendered":"<p>Word2Vec\uc740 \uac04\ub2e8\uc744 \uac04\ub2e8\ud788 \ub9d0\ud558\uba74 &#8220;\ubb38\uc7a5\uc548\uc5d0 \uc788\ub294 \uc5ec\ub7ec \ub2e8\uc5b4\ub4e4\uc744 \ubca1\ud130 \ud615\ud0dc\ub85c \ud45c\ud604\ud558\ub294 \uac83&#8221; \ub9d0 \uadf8\ub300\ub85c Word to Vector\ub77c\uace0 \ud560 \uc218 \uc788\uc2b5\ub2c8\ub2e4. \uc6cc\ub4dc\ub77c\ub294 \ub9d0\uc740 \uc27d\uac8c \uc774\ud574\ud560 \uc218 \uc788\uc9c0\ub9cc \ubca1\ud130(Vector)\ub294 \uc5b4\ub5a4 \ub73b\uc77c\uae4c\uc694? \ubb3c\ub9ac\ud559\uc774\ub098 \uc218\ud559\uc5d0\uc11c \uc57d\uac04\uc529 \ucc28\uc774\uac00 \uc788\uc9c0\ub9cc \uacf5\ud1b5\uc801\uc73c\ub85c \uc5b4\ub5a4 \uacf5\uac04\uc5d0\uc11c \uc704\uce58\uc640 \ubc29\ud5a5\uc131\uc744 \uac00\uc9c0\ub294 \uac12\uc744 \ud45c\ud604\ud558\ub294 \uac83\uc774\ub77c\uace0 \ud560 \uc218 \uc788\uc2b5\ub2c8\ub2e4. \uadf8\ub7ec\ub2c8\uae4c \ubb38\uc7a5\uc5d0 \ub9ce\uc740 Word\ub97c \uc5b4\ub5a4 \uacf5\uac04\uc5d0 \uc704\uce58\uac12\uc744 \ud45c\uc2dc\ud560 \ubfd0\ub9cc\uc544\ub2c8\ub77c &hellip; <\/p>\n<p class=\"link-more\"><a href=\"http:\/\/blog.cedartrees.co.kr\/index.php\/2020\/10\/16\/word2vec-test\/\" class=\"more-link\">\ub354 \ubcf4\uae30<span class=\"screen-reader-text\"> &#8220;Word2Vec 
\uc2dc\uac01\ud654&#8221;<\/span><\/a><\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":[],"categories":[18,14],"tags":[98,99,110],"_links":{"self":[{"href":"http:\/\/blog.cedartrees.co.kr\/index.php\/wp-json\/wp\/v2\/posts\/459"}],"collection":[{"href":"http:\/\/blog.cedartrees.co.kr\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"http:\/\/blog.cedartrees.co.kr\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"http:\/\/blog.cedartrees.co.kr\/index.php\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"http:\/\/blog.cedartrees.co.kr\/index.php\/wp-json\/wp\/v2\/comments?post=459"}],"version-history":[{"count":9,"href":"http:\/\/blog.cedartrees.co.kr\/index.php\/wp-json\/wp\/v2\/posts\/459\/revisions"}],"predecessor-version":[{"id":474,"href":"http:\/\/blog.cedartrees.co.kr\/index.php\/wp-json\/wp\/v2\/posts\/459\/revisions\/474"}],"wp:attachment":[{"href":"http:\/\/blog.cedartrees.co.kr\/index.php\/wp-json\/wp\/v2\/media?parent=459"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"http:\/\/blog.cedartrees.co.kr\/index.php\/wp-json\/wp\/v2\/categories?post=459"},{"taxonomy":"post_tag","embeddable":true,"href":"http:\/\/blog.cedartrees.co.kr\/index.php\/wp-json\/wp\/v2\/tags?post=459"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}