diff --git a/data_science/nlp/word2vec_gensim.ipynb b/data_science/nlp/word2vec_gensim.ipynb new file mode 100644 index 0000000..5024407 --- /dev/null +++ b/data_science/nlp/word2vec_gensim.ipynb @@ -0,0 +1,229 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# pretrained Word2Vec download" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2020-06-03 23:13:08-- https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz\n", + "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.16.238\n", + "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.16.238|:443... connected.\n", + "HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable\n", + "\n", + " The file is already fully retrieved; nothing to do.\n", + "\n" + ] + } + ], + "source": [ + "!wget -P . 
-c \"https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import gensim" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# load the pretrained GoogleNews word2vec vectors (gzip-compressed binary format)\n", + "model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('pal', 0.7476358413696289),\n", + " ('friends', 0.7098034620285034),\n", + " ('buddy', 0.6972494125366211),\n", + " ('dear_friend', 0.6960037350654602),\n", + " ('acquaintance', 0.6843010187149048)]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# top-5 words most similar to 'friend' (score = cosine similarity)\n", + "model.most_similar(positive=['friend'], topn=5)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('queen', 0.7118192911148071)]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# word analogy: king - man + woman is closest to queen\n", + "model.most_similar(positive=['king', 'woman'], negative=['man'], topn=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "300" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# dimensionality of a single word vector\n", + "len(model['friend'])" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 0.07080078, -0.21386719, 0.15332031, 0.09423828, -0.03442383,\n", + " 0.43359375, -0.16503906, -0.05786133, 0.17578125, -0.08203125,\n", + " 0.24511719, -0.19335938, -0.0255127 , 
-0.09619141, -0.125 ,\n", + " 0.02575684, 0.16796875, -0.03759766, 0.09472656, -0.04760742,\n", + " 0.20605469, 0.31835938, 0.15917969, -0.17089844, 0.09033203,\n", + " -0.1640625 , -0.15234375, 0.3125 , 0.06298828, -0.24902344,\n", + " 0.15625 , -0.04516602, -0.12890625, -0.00686646, -0.02160645,\n", + " 0.14453125, 0.2734375 , 0.12695312, 0.10742188, 0.11376953,\n", + " 0.14355469, -0.00173187, 0.22851562, -0.03515625, 0.17089844,\n", + " 0.04516602, -0.07958984, -0.08886719, -0.01342773, -0.09667969,\n", + " -0.12597656, 0.10595703, 0.15332031, -0.03808594, 0.02246094,\n", + " 0.01428223, -0.03295898, 0.20703125, -0.03417969, 0.02233887,\n", + " 0.00244141, 0.13476562, -0.01403809, 0.13378906, 0.0201416 ,\n", + " 0.14746094, 0.00759888, -0.18652344, 0.16113281, 0.109375 ,\n", + " 0.14355469, 0.01623535, 0.01867676, 0.09179688, -0.33789062,\n", + " 0.19335938, -0.29101562, -0.00860596, 0.10644531, 0.359375 ,\n", + " 0.25585938, -0.03320312, 0.15625 , -0.24316406, -0.06738281,\n", + " 0.09033203, -0.125 , 0.21777344, -0.02380371, -0.06445312,\n", + " -0.14355469, 0.05664062, -0.12597656, 0.02172852, 0.03833008,\n", + " -0.17578125, -0.08349609, 0.21386719, -0.01855469, -0.23535156,\n", + " -0.14746094, -0.16113281, -0.03125 , -0.10107422, 0.07080078,\n", + " 0.01135254, -0.04370117, 0.07666016, 0.16503906, 0.04541016,\n", + " -0.13867188, 0.13085938, 0.13378906, -0.14453125, 0.12792969,\n", + " -0.06787109, -0.04296875, -0.03369141, 0.10302734, 0.22949219,\n", + " 0.14160156, -0.01153564, -0.00086212, -0.10449219, -0.03710938,\n", + " 0.01928711, 0.16699219, -0.06079102, 0.09814453, 0.0703125 ,\n", + " -0.39648438, -0.23242188, -0.04077148, 0.09570312, -0.0546875 ,\n", + " -0.09814453, 0.09082031, 0.03588867, 0.09228516, 0.3125 ,\n", + " 0.10595703, 0.18847656, -0.11230469, 0.00842285, 0.08935547,\n", + " 0.04663086, -0.25 , -0.03369141, 0.03808594, -0.03710938,\n", + " 0.42773438, 0.10839844, -0.01391602, -0.01965332, -0.04296875,\n", + " -0.11035156, 0.0390625 , 
0.04541016, -0.20019531, -0.14355469,\n", + " -0.14257812, 0.03662109, 0.25 , 0.3671875 , -0.12304688,\n", + " -0.0859375 , 0.24902344, -0.21582031, 0.02648926, 0.17871094,\n", + " 0.29296875, 0.21582031, 0.1015625 , 0.00167084, -0.07177734,\n", + " 0.03686523, 0.22851562, -0.125 , 0.17285156, 0.22265625,\n", + " 0.21191406, 0.03686523, 0.09570312, -0.00344849, 0.13183594,\n", + " -0.23925781, 0.00576782, 0.27148438, 0.10400391, 0.0098877 ,\n", + " -0.24511719, 0.21777344, -0.03027344, 0.23046875, 0.11816406,\n", + " 0.1640625 , -0.00109863, 0.00349426, -0.02197266, -0.09179688,\n", + " -0.10351562, 0.06933594, -0.13476562, -0.06201172, 0.14355469,\n", + " -0.10888672, -0.11328125, 0.2109375 , -0.10839844, -0.18261719,\n", + " -0.06689453, -0.265625 , -0.13378906, -0.04296875, -0.17773438,\n", + " 0.00689697, -0.00982666, -0.00640869, -0.12792969, 0.08203125,\n", + " -0.01367188, 0.02734375, 0.12597656, -0.00772095, -0.04614258,\n", + " -0.12255859, 0.16210938, 0.28320312, 0.04296875, -0.05175781,\n", + " -0.16210938, 0.14648438, -0.18359375, -0.24511719, 0.22167969,\n", + " 0.0546875 , -0.10302734, -0.07763672, -0.33984375, -0.05908203,\n", + " -0.0022583 , -0.11962891, -0.3046875 , 0.02233887, 0.02941895,\n", + " 0.37695312, -0.01721191, -0.05932617, 0.30273438, -0.13574219,\n", + " 0.14746094, 0.17089844, 0.16015625, 0.21484375, 0.01013184,\n", + " 0.06738281, -0.12109375, -0.12304688, -0.20117188, 0.02880859,\n", + " -0.00662231, -0.20410156, 0.02001953, -0.15136719, 0.16699219,\n", + " 0.14160156, -0.02331543, 0.14550781, -0.13476562, 0.04785156,\n", + " 0.14160156, 0.03808594, -0.12109375, 0.02770996, -0.0123291 ,\n", + " -0.20410156, -0.06445312, 0.06079102, -0.07519531, -0.28125 ,\n", + " 0.18261719, -0.25390625, -0.0456543 , 0.14160156, -0.0546875 ,\n", + " -0.01477051, -0.38085938, 0.14355469, 0.12255859, 0.14941406,\n", + " -0.03320312, 0.19433594, -0.34375 , -0.24902344, -0.00331116,\n", + " -0.05639648, -0.00079727, -0.21679688, -0.01977539, 
0.10644531],\n", + " dtype=float32)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# show the raw 300-dimensional embedding vector for 'friend'\n", + "model['friend']" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}