From 3c056cb68b31cbf814bb75168f9a106bfc83e09c Mon Sep 17 00:00:00 2001 From: huseinzol05 Date: Mon, 19 Dec 2022 22:24:55 +0800 Subject: [PATCH] finalize 5.0 --- docs/load-augmentation-abstractive.ipynb | 216 +++++-- docs/load-dependency-huggingface.ipynb | 46 +- docs/load-normalizer-abstractive.ipynb | 331 ++++++++-- docs/load-translation-en-ms-huggingface.ipynb | 103 ++- docs/load-translation-ms-en-huggingface.ipynb | 103 ++- ...-translation-noisy-en-ms-huggingface.ipynb | 30 +- ...-translation-noisy-ms-en-huggingface.ipynb | 32 +- .../load-augmentation-abstractive.ipynb | 216 +++++-- .../load-dependency-huggingface.ipynb | 49 +- .../load-translation-en-ms-huggingface.ipynb | 234 +++---- .../load-translation-ms-en-huggingface.ipynb | 103 ++- ...-translation-noisy-ms-en-huggingface.ipynb | 32 +- .../load-normalizer-abstractive.ipynb | 331 ++++++++-- malaya/augmentation/abstractive.py | 12 +- malaya/dependency.py | 6 +- malaya/normalizer/abstractive.py | 32 +- malaya/normalizer/rules.py | 4 +- malaya/supervised/huggingface.py | 4 +- malaya/torch_model/huggingface.py | 24 +- malaya/translation/en_ms.py | 14 +- malaya/translation/ms_en.py | 14 +- session/translation/noisy-hf-t5/README.md | 13 +- .../translation/noisy-hf-t5/export/base.ipynb | 531 ++++++++++----- .../noisy-hf-t5/export/small.ipynb | 591 ++++++++++++----- .../translation/noisy-hf-t5/export/tiny.ipynb | 519 +++++++++------ .../noisy-hf-t5/prepare-data.ipynb | 604 ++++++++++++++++++ 26 files changed, 3134 insertions(+), 1060 deletions(-) create mode 100644 session/translation/noisy-hf-t5/prepare-data.ipynb diff --git a/docs/load-augmentation-abstractive.ipynb b/docs/load-augmentation-abstractive.ipynb index ed6de20c..48d0d38c 100644 --- a/docs/load-augmentation-abstractive.ipynb +++ b/docs/load-augmentation-abstractive.ipynb @@ -50,8 +50,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 3.02 s, sys: 3.63 s, total: 6.65 s\n", - "Wall time: 2.51 s\n" + "CPU times: user 3.23 s, 
sys: 3.54 s, total: 6.77 s\n", + "Wall time: 2.25 s\n" ] } ], @@ -106,45 +106,52 @@ " \n", " \n", " Size (MB)\n", - " ROUGE-1\n", - " ROUGE-2\n", - " ROUGE-L\n", + " BLEU\n", + " SacreBLEU Verbose\n", " Suggested length\n", " \n", " \n", " \n", " \n", + " mesolitica/finetune-noisy-translation-t5-tiny-bahasa-cased-v2\n", + " 139\n", + " 60.000967\n", + " 77.9/63.9/54.6/47.7 (BP = 1.000 ratio = 1.036 ...\n", + " 256\n", + " \n", + " \n", " mesolitica/finetune-noisy-translation-t5-small-bahasa-cased-v4\n", - " 242.0\n", - " 0.757218\n", - " 0.496729\n", - " 0.304022\n", - " 256.0\n", + " 242\n", + " 64.062582\n", + " 80.1/67.7/59.1/52.5 (BP = 1.000 ratio = 1.042 ...\n", + " 256\n", " \n", " \n", " mesolitica/finetune-noisy-translation-t5-base-bahasa-cased-v2\n", - " 892.0\n", - " 0.713227\n", - " 0.470135\n", - " 0.366797\n", - " 256.0\n", + " 892\n", + " 64.583819\n", + " 80.2/68.1/59.8/53.2 (BP = 1.000 ratio = 1.048 ...\n", + " 256\n", " \n", " \n", "\n", "" ], "text/plain": [ - " Size (MB) ROUGE-1 \\\n", - "mesolitica/finetune-noisy-translation-t5-small-... 242.0 0.757218 \n", - "mesolitica/finetune-noisy-translation-t5-base-b... 892.0 0.713227 \n", + " Size (MB) BLEU \\\n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... 139 60.000967 \n", + "mesolitica/finetune-noisy-translation-t5-small-... 242 64.062582 \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 892 64.583819 \n", "\n", - " ROUGE-2 ROUGE-L \\\n", - "mesolitica/finetune-noisy-translation-t5-small-... 0.496729 0.304022 \n", - "mesolitica/finetune-noisy-translation-t5-base-b... 0.470135 0.366797 \n", + " SacreBLEU Verbose \\\n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... 77.9/63.9/54.6/47.7 (BP = 1.000 ratio = 1.036 ... \n", + "mesolitica/finetune-noisy-translation-t5-small-... 80.1/67.7/59.1/52.5 (BP = 1.000 ratio = 1.042 ... \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 80.2/68.1/59.8/53.2 (BP = 1.000 ratio = 1.048 ... 
\n", "\n", - " Suggested length \n", - "mesolitica/finetune-noisy-translation-t5-small-... 256.0 \n", - "mesolitica/finetune-noisy-translation-t5-base-b... 256.0 " + " Suggested length \n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... 256 \n", + "mesolitica/finetune-noisy-translation-t5-small-... 256 \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 256 " ] }, "execution_count": 3, @@ -193,9 +200,38 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b9b42d7176db4aa78f8145770bbf783f", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading: 0%| | 0.00/826 [00:00\n", " \n", " mesolitica/finetune-dependency-t5-tiny-standard-bahasa-cased\n", - " 61.2\n", - " 0.84929\n", - " 0.8281\n", - " 0.92099\n", + " 143.0\n", + " 0.850607\n", + " 0.783164\n", + " 0.872302\n", " \n", " \n", " mesolitica/finetune-dependency-t5-small-standard-bahasa-cased\n", - " 61.2\n", - " 0.84929\n", - " 0.8281\n", - " 0.92099\n", + " 247.0\n", + " 0.849405\n", + " 0.783103\n", + " 0.866906\n", " \n", " \n", " mesolitica/finetune-dependency-t5-base-standard-bahasa-cased\n", - " 61.2\n", - " 0.84929\n", - " 0.8281\n", - " 0.92099\n", + " 898.0\n", + " 0.852892\n", + " 0.784091\n", + " 0.859712\n", " \n", " \n", "\n", @@ -356,19 +356,19 @@ ], "text/plain": [ " Size (MB) Arc Accuracy \\\n", - "mesolitica/finetune-dependency-t5-tiny-standard... 61.2 0.84929 \n", - "mesolitica/finetune-dependency-t5-small-standar... 61.2 0.84929 \n", - "mesolitica/finetune-dependency-t5-base-standard... 61.2 0.84929 \n", + "mesolitica/finetune-dependency-t5-tiny-standard... 143.0 0.850607 \n", + "mesolitica/finetune-dependency-t5-small-standar... 247.0 0.849405 \n", + "mesolitica/finetune-dependency-t5-base-standard... 898.0 0.852892 \n", "\n", " Types Accuracy \\\n", - "mesolitica/finetune-dependency-t5-tiny-standard... 
0.8281 \n", - "mesolitica/finetune-dependency-t5-small-standar... 0.8281 \n", - "mesolitica/finetune-dependency-t5-base-standard... 0.8281 \n", + "mesolitica/finetune-dependency-t5-tiny-standard... 0.783164 \n", + "mesolitica/finetune-dependency-t5-small-standar... 0.783103 \n", + "mesolitica/finetune-dependency-t5-base-standard... 0.784091 \n", "\n", " Root Accuracy \n", - "mesolitica/finetune-dependency-t5-tiny-standard... 0.92099 \n", - "mesolitica/finetune-dependency-t5-small-standar... 0.92099 \n", - "mesolitica/finetune-dependency-t5-base-standard... 0.92099 " + "mesolitica/finetune-dependency-t5-tiny-standard... 0.872302 \n", + "mesolitica/finetune-dependency-t5-small-standar... 0.866906 \n", + "mesolitica/finetune-dependency-t5-base-standard... 0.859712 " ] }, "execution_count": 5, diff --git a/docs/load-normalizer-abstractive.ipynb b/docs/load-normalizer-abstractive.ipynb index 83240427..bd10f282 100644 --- a/docs/load-normalizer-abstractive.ipynb +++ b/docs/load-normalizer-abstractive.ipynb @@ -49,8 +49,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 3.18 s, sys: 3.4 s, total: 6.58 s\n", - "Wall time: 2.33 s\n" + "CPU times: user 3.24 s, sys: 3.67 s, total: 6.91 s\n", + "Wall time: 2.25 s\n" ] } ], @@ -93,45 +93,52 @@ " \n", " \n", " Size (MB)\n", - " ROUGE-1\n", - " ROUGE-2\n", - " ROUGE-L\n", + " BLEU\n", + " SacreBLEU Verbose\n", " Suggested length\n", " \n", " \n", " \n", " \n", + " mesolitica/finetune-noisy-translation-t5-tiny-bahasa-cased-v2\n", + " 139\n", + " 60.000967\n", + " 77.9/63.9/54.6/47.7 (BP = 1.000 ratio = 1.036 ...\n", + " 256\n", + " \n", + " \n", " mesolitica/finetune-noisy-translation-t5-small-bahasa-cased-v4\n", - " 242.0\n", - " 0.757218\n", - " 0.496729\n", - " 0.304022\n", - " 256.0\n", + " 242\n", + " 64.062582\n", + " 80.1/67.7/59.1/52.5 (BP = 1.000 ratio = 1.042 ...\n", + " 256\n", " \n", " \n", " mesolitica/finetune-noisy-translation-t5-base-bahasa-cased-v2\n", - " 892.0\n", - " 0.713227\n", - " 
0.470135\n", - " 0.366797\n", - " 256.0\n", + " 892\n", + " 64.583819\n", + " 80.2/68.1/59.8/53.2 (BP = 1.000 ratio = 1.048 ...\n", + " 256\n", " \n", " \n", "\n", "" ], "text/plain": [ - " Size (MB) ROUGE-1 \\\n", - "mesolitica/finetune-noisy-translation-t5-small-... 242.0 0.757218 \n", - "mesolitica/finetune-noisy-translation-t5-base-b... 892.0 0.713227 \n", + " Size (MB) BLEU \\\n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... 139 60.000967 \n", + "mesolitica/finetune-noisy-translation-t5-small-... 242 64.062582 \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 892 64.583819 \n", "\n", - " ROUGE-2 ROUGE-L \\\n", - "mesolitica/finetune-noisy-translation-t5-small-... 0.496729 0.304022 \n", - "mesolitica/finetune-noisy-translation-t5-base-b... 0.470135 0.366797 \n", + " SacreBLEU Verbose \\\n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... 77.9/63.9/54.6/47.7 (BP = 1.000 ratio = 1.036 ... \n", + "mesolitica/finetune-noisy-translation-t5-small-... 80.1/67.7/59.1/52.5 (BP = 1.000 ratio = 1.042 ... \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 80.2/68.1/59.8/53.2 (BP = 1.000 ratio = 1.048 ... \n", "\n", - " Suggested length \n", - "mesolitica/finetune-noisy-translation-t5-small-... 256.0 \n", - "mesolitica/finetune-noisy-translation-t5-base-b... 256.0 " + " Suggested length \n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... 256 \n", + "mesolitica/finetune-noisy-translation-t5-small-... 256 \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 
256 " ] }, "execution_count": 3, @@ -151,7 +158,7 @@ "\n", "```python\n", "def huggingface(\n", - " model: str = 'mesolitica/finetune-normalizer-t5-small-standard-bahasa-cased',\n", + " model: str = 'mesolitica/finetune-noisy-translation-t5-small-bahasa-cased-v4',\n", " force_check: bool = True,\n", " use_rules_normalizer: bool = True,\n", " kenlm_model: str = 'bahasa-wiki-news',\n", @@ -170,7 +177,7 @@ "\n", " Parameters\n", " ----------\n", - " model: str, optional (default='mesolitica/finetune-normalizer-t5-small-standard-bahasa-cased')\n", + " model: str, optional (default='mesolitica/finetune-noisy-translation-t5-small-bahasa-cased-v4')\n", " Check available models at `malaya.normalizer.abstractive.available_huggingface()`.\n", " force_check: bool, optional (default=True)\n", " Force check model one of malaya model.\n", @@ -179,10 +186,12 @@ " kenlm_model: str, optional (default='bahasa-wiki-news')\n", " the model trained on `malaya.language_model.kenlm(model = 'bahasa-wiki-news')`,\n", " but you can use any kenlm model from `malaya.language_model.available_kenlm`.\n", + " Also you can pass as None to skip spelling correction but still apply rules normalizer.\n", " This parameter will be ignored if `use_rules_normalizer=False`.\n", " stem_model: str, optional (default='noisy')\n", " the model trained on `malaya.stem.deep_model(model = 'noisy'),\n", " but you can use any stemmer model from `malaya.stem.available_model`.\n", + " Also you can pass as None to skip stemming but still apply rules normalizer.\n", " This parameter will be ignored if `use_rules_normalizer=False`.\n", " segmenter: Callable, optional (default=None)\n", " segmenter function to segment text, read more at https://malaya.readthedocs.io/en/stable/load-normalizer.html#Use-segmenter\n", @@ -210,6 +219,31 @@ "```" ] }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"2022-12-19 15:05:28.466979: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", + "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2022-12-19 15:05:28.472055: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected\n", + "2022-12-19 15:05:28.472078: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: husein-MS-7D31\n", + "2022-12-19 15:05:28.472081: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: husein-MS-7D31\n", + "2022-12-19 15:05:28.472163: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: Not found: was unable to find libcuda.so DSO loaded into this program\n", + "2022-12-19 15:05:28.472205: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 470.161.3\n" + ] + } + ], + "source": [ + "model_default = malaya.normalizer.abstractive.huggingface(model = 'mesolitica/finetune-noisy-translation-t5-small-bahasa-cased-v4',)" + ] + }, { "cell_type": "code", "execution_count": 5, @@ -243,22 +277,69 @@ "output_type": "execute_result" } ], + "source": [ + "model_default.generate(['ak tk suka hg'], max_length = 256)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "You're using a T5TokenizerFast tokenizer. 
Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n" + ] + }, + { + "data": { + "text/plain": [ + "['Saya tidak suka awak']" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "model.generate(['ak tk suka hg'], max_length = 256)" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['Dia berada di lorong dalam tetapi dia mahu masuk ke kanan']" + "['Dia berada di lorong dalam tetapi mahu masuk ke kanan']" ] }, - "execution_count": 13, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model_default.generate(['Dia kat lane dalam tapi nk masuk kanan'], max_length = 256)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Dia berada di lorong dalam tetapi mahu masuk ke kanan']" + ] + }, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -269,7 +350,27 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['@mesyceres Haha terkejut nak keluarkan semua. Idul Fitri lagi.']" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model_default.generate(['@mesyceres Haha ayookk keluarkan semuanya. Bentar lagi Idul Fitri .'], max_length = 256)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, "metadata": { "scrolled": true }, @@ -280,7 +381,7 @@ "['@mesyceres Haha jom keluarkan semuanya. 
Idul Fitri lagi.']" ] }, - "execution_count": 14, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -291,16 +392,37 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['Hai kawan-kawan! Saya perasan semalam & hari ini ramai yang dapat biskut ni kan? Jadi hari ini saya ingin berkongsi beberapa post mortem batch pertama kami:']" + "['Hai semua! Saya perasan semalam & hari ni ramai yang dapat biskut ni kan? Jadi hari ini saya ingin berkongsi beberapa post mortem batch pertama kami:']" ] }, - "execution_count": 15, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model_default.generate(['Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch:'],\n", + " max_length = 256)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Hai semua! Saya perasan semalam & hari ni ramai yang dapat biskut ni kan? 
Jadi hari ini saya ingin berkongsi beberapa post mortem batch pertama kami:']" + ] + }, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -312,25 +434,66 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['Tak guna budak zaman sekarang ni, nak gosok baju pun kena belajar tiktok.']" + "['Tak guna budak zaman sekarang ni, suruh gosok baju pun kena belajar kat toktok.']" ] }, - "execution_count": 16, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s = 'Punyalah tak guna bebudak zaman sekarang ni sampaikan gosok baju pun kena belajar kat tiktok.'\n", + "model_default.generate([s], max_length = 256)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Tak guna budak zaman sekarang ni, kalau gosok baju pun kena belajar kat tiktok.']" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ "model.generate([s], max_length = 256)" ] }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Sepi juga bila adik beradik dah kahwin / kahwin']" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "s = 'Lonely jugak ye when your siblings are married/ getting married'\n", + "model_default.generate([s], max_length = 256)" + ] + }, { "cell_type": "code", "execution_count": 17, @@ -339,7 +502,7 @@ { "data": { "text/plain": [ - "['Sepi juga apabila adik-beradik anda sudah berkahwin/berkahwin']" + "['Sepi juga bila adik beradik dah kahwin/kahwin']" ] }, "execution_count": 17, @@ -348,7 +511,6 @@ } ], "source": [ - "s = 'Lonely jugak ye when your siblings are married/ getting married'\n", "model.generate([s], max_length = 256)" ] }, @@ -360,7 +522,7 @@ { "data": { 
"text/plain": [ - "['Rasa janggal bila tengok kerajaan cepat buat kerja masa bencana. Dengan mesin yang ada, ia sebenarnya boleh. Ini adalah permulaan yang baik.']" + "['Rasa janggal bila tengok kerajaan cepat buat kerja bencana. Dengan mesin yang ada, ia sebenarnya boleh. Ini adalah permulaan yang baik.']" ] }, "execution_count": 18, @@ -370,6 +532,26 @@ ], "source": [ "s = 'Rasa awkward bila tengok kerajaan laju buat kerja time bencana. With the machineries yang ada, sebenarnya boleh je. This is a good start.'\n", + "model_default.generate([s], max_length = 256)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Rasa janggal bila tengok kerajaan cepat buat kerja bencana. Dengan mesin yang ada, ia sebenarnya boleh. Ini adalah permulaan yang baik.']" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ "model.generate([s], max_length = 256)" ] }, @@ -381,7 +563,7 @@ { "data": { "text/plain": [ - "['kalau boleh elakkan ada perkataan macam ni dalam resume. Bukan sebab apa, sebelum kita pergi ke fasa temuduga, dekat resume kita sendiri, kita di sini untuk memberi kesan yang baik untuk menunjukkan siapa kita sebenarnya']" + "['kalau boleh elakkan ada perkataan macam ni dalam resume. Bukan sebab apa, sebelum kita pergi fasa temuduga, dekat resume sendiri kita kena bagi kesan yang baik untuk menunjukkan siapa kita sebenarnya.']" ] }, "execution_count": 20, @@ -391,7 +573,7 @@ ], "source": [ "s = 'kalau boleh elakkan ada perkataan macam ni dalam resume. Bukan sebab apa, sebelum kita ke fasa interview, dekat resume tu sendiri kita dah kene bagi good impression menunjukkan siapa diri kita yang sebenarnya'\n", - "model.generate([s], max_length = 256)" + "model_default.generate([s], max_length = 256)" ] }, { @@ -402,7 +584,7 @@ { "data": { "text/plain": [ - "['Sudah, bro.. 
berehat dari politik supaya orang tidak terus dibenci dengan kenyataan yang pelik dan mengelirukan...']" + "['kalau boleh elakkan ada perkataan macam ni dalam resume. Bukan sebab apa, sebelum kita pergi fasa temuduga, dekat resume sendiri kita dah dapat kesan yang baik menunjukkan siapa diri kita yang sebenar']" ] }, "execution_count": 21, @@ -410,8 +592,48 @@ "output_type": "execute_result" } ], + "source": [ + "model.generate([s], max_length = 256)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Sudah cukup, puan... berehatlah dari politik agar tidak terus dibenci orang dengan kenyataan pelik dan mengelirukan...']" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "s = 'Udah2 lah ayoh cik...berehatlah dari politik agar tidak berterusan dibenci orang dgn kenyataan yg pelik dan mengelirukan...'\n", + "model_default.generate([s], max_length = 256)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Itu sahaja, puan... berehatlah dari politik agar tidak terus dibenci orang dengan kenyataan pelik dan mengelirukan...']" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ "model.generate([s], max_length = 256)" ] }, @@ -432,9 +654,9 @@ { "data": { "text/plain": [ - "['Sudah ayoh cik... rehat dari politik supaya tidak terus dibenci orang dengan kenyataan yang pelik dan mengelirukan...',\n", - " 'Sudah-Sudahlah bro.. berehat dari politik supaya orang tidak terus dibenci dengan kenyataan yang pelik dan mengelirukan...',\n", - " 'Itu sahaja.. berehat dari politik supaya tidak terus dibenci orang dengan kenyataan yang pelik dan mengelirukan...']" + "['Ayuh cik... rehat dari politik supaya tidak terus dibenci orang dengan kenyataan yang pelik dan mengelirukan...',\n", + " 'Itu sahaja, puan... 
rehatkan politik supaya tidak terus dibenci orang dengan kenyataan pelik dan mengelirukan...',\n", + " 'Dah tu pakcik... berehat dari politik supaya orang tidak terus membencinya dengan kenyataan pelik dan mengelirukan...']" ] }, "execution_count": 24, @@ -450,18 +672,18 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['Sudah, bro.. berehat dari politik supaya tidak terus dibenci orang dengan kenyataan yang pelik dan mengelirukan...',\n", - " 'Itu sahaja bro.. rehat daripada politik supaya tidak dibenci orang dengan kenyataan yang pelik dan mengelirukan...',\n", - " 'Itu sahaja bro... berehatlah dari politik supaya orang tidak terus dibenci dengan kenyataan yang pelik dan mengelirukan...']" + "['Itu sahaja, puan... rehat politik supaya tidak terus dibenci oleh orang dengan kenyataan pelik dan mengelirukan...',\n", + " 'Sudah, ayuh cik.. berehat dari politik supaya tidak terus dibenci orang dengan kenyataan pelik dan mengelirukan...',\n", + " 'Itu sahaja, makcik... 
rehat dari politik supaya tidak terus dibenci orang dengan kenyataan yang pelik dan mengelirukan...']" ] }, - "execution_count": 27, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -470,6 +692,13 @@ "model.generate([s], max_length = 256, do_sample=True, penalty_alpha=0.6, top_k=4, temperature = 0.7,\n", " num_return_sequences=3)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/docs/load-translation-en-ms-huggingface.ipynb b/docs/load-translation-en-ms-huggingface.ipynb index 373d808b..a6e0c65b 100644 --- a/docs/load-translation-en-ms-huggingface.ipynb +++ b/docs/load-translation-en-ms-huggingface.ipynb @@ -49,18 +49,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 3.54 s, sys: 3.2 s, total: 6.74 s\n", - "Wall time: 2.65 s\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/husein/dev/malaya/malaya/tokenizer.py:208: FutureWarning: Possible nested set at position 3372\n", - " self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n", - "/home/husein/dev/malaya/malaya/tokenizer.py:208: FutureWarning: Possible nested set at position 3890\n", - " self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n" + "CPU times: user 3.16 s, sys: 3.76 s, total: 6.93 s\n", + "Wall time: 2.3 s\n" ] } ], @@ -91,7 +81,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "INFO:malaya.translation.en_ms:tested on FLORES200 EN-MS (eng_Latn-zsm_Latn) pair `dev` set, https://github.com/facebookresearch/flores/tree/main/flores200\n" + "INFO:malaya.translation.en_ms:tested on FLORES200 EN-MS (eng_Latn-zsm_Latn) pair `dev` set, https://github.com/facebookresearch/flores/tree/main/flores200\n", + "INFO:malaya.translation.en_ms:for noisy, tested on noisy twitter google translation, https://huggingface.co/datasets/mesolitica/augmentation-test-set\n" ] }, { @@ -124,6 +115,14 @@ " \n", " \n", " \n", + " 
mesolitica/finetune-translation-t5-super-super-tiny-standard-bahasa-cased\n", + " 23.3\n", + " 36.290743\n", + " 71.2/46.0/30.9/21.0 (BP = 0.950 ratio = 0.951 ...\n", + " 61.89\n", + " 256\n", + " \n", + " \n", " mesolitica/finetune-translation-t5-super-tiny-standard-bahasa-cased\n", " 50.7\n", " 39.188342\n", @@ -155,34 +154,110 @@ " 67.6\n", " 256\n", " \n", + " \n", + " mesolitica/finetune-noisy-translation-t5-tiny-bahasa-cased\n", + " 139\n", + " 41.036414\n", + " 72.9/49.2/34.8/25.0 (BP = 0.977 ratio = 0.977 ...\n", + " 65.58\n", + " 256\n", + " \n", + " \n", + " mesolitica/finetune-noisy-translation-t5-small-bahasa-cased\n", + " 242\n", + " 41.15794\n", + " 72.2/48.8/34.5/24.8 (BP = 0.988 ratio = 0.988 ...\n", + " 65.51\n", + " 256\n", + " \n", + " \n", + " mesolitica/finetune-noisy-translation-t5-base-bahasa-cased\n", + " 892\n", + " 41.827831\n", + " 73.4/50.1/35.7/25.8 (BP = 0.982 ratio = 0.982 ...\n", + " 66.51\n", + " 256\n", + " \n", + " \n", + " mesolitica/finetune-noisy-translation-t5-tiny-bahasa-cased-v2\n", + " 139\n", + " 60.000967\n", + " 77.9/63.9/54.6/47.7 (BP = 1.000 ratio = 1.036 ...\n", + " None\n", + " 256\n", + " \n", + " \n", + " mesolitica/finetune-noisy-translation-t5-small-bahasa-cased-v4\n", + " 242\n", + " 64.062582\n", + " 80.1/67.7/59.1/52.5 (BP = 1.000 ratio = 1.042 ...\n", + " None\n", + " 256\n", + " \n", + " \n", + " mesolitica/finetune-noisy-translation-t5-base-bahasa-cased-v2\n", + " 892\n", + " 64.583819\n", + " 80.2/68.1/59.8/53.2 (BP = 1.000 ratio = 1.048 ...\n", + " None\n", + " 256\n", + " \n", " \n", "\n", "" ], "text/plain": [ " Size (MB) BLEU \\\n", + "mesolitica/finetune-translation-t5-super-super-... 23.3 36.290743 \n", "mesolitica/finetune-translation-t5-super-tiny-s... 50.7 39.188342 \n", "mesolitica/finetune-translation-t5-tiny-standar... 139 41.625536 \n", "mesolitica/finetune-translation-t5-small-standa... 242 43.937298 \n", "mesolitica/finetune-translation-t5-base-standar... 
892 44.173559 \n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... 139 41.036414 \n", + "mesolitica/finetune-noisy-translation-t5-small-... 242 41.15794 \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 892 41.827831 \n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... 139 60.000967 \n", + "mesolitica/finetune-noisy-translation-t5-small-... 242 64.062582 \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 892 64.583819 \n", "\n", " SacreBLEU Verbose \\\n", + "mesolitica/finetune-translation-t5-super-super-... 71.2/46.0/30.9/21.0 (BP = 0.950 ratio = 0.951 ... \n", "mesolitica/finetune-translation-t5-super-tiny-s... 72.6/48.3/33.5/23.6 (BP = 0.960 ratio = 0.961 ... \n", "mesolitica/finetune-translation-t5-tiny-standar... 73.4/50.1/35.7/25.7 (BP = 0.971 ratio = 0.972 ... \n", "mesolitica/finetune-translation-t5-small-standa... 74.9/52.2/37.9/27.7 (BP = 0.976 ratio = 0.977 ... \n", "mesolitica/finetune-translation-t5-base-standar... 74.7/52.3/38.0/28.0 (BP = 0.979 ratio = 0.979 ... \n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... 72.9/49.2/34.8/25.0 (BP = 0.977 ratio = 0.977 ... \n", + "mesolitica/finetune-noisy-translation-t5-small-... 72.2/48.8/34.5/24.8 (BP = 0.988 ratio = 0.988 ... \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 73.4/50.1/35.7/25.8 (BP = 0.982 ratio = 0.982 ... \n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... 77.9/63.9/54.6/47.7 (BP = 1.000 ratio = 1.036 ... \n", + "mesolitica/finetune-noisy-translation-t5-small-... 80.1/67.7/59.1/52.5 (BP = 1.000 ratio = 1.042 ... \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 80.2/68.1/59.8/53.2 (BP = 1.000 ratio = 1.048 ... \n", "\n", " SacreBLEU-chrF++-FLORES200 \\\n", + "mesolitica/finetune-translation-t5-super-super-... 61.89 \n", "mesolitica/finetune-translation-t5-super-tiny-s... 64.03 \n", "mesolitica/finetune-translation-t5-tiny-standar... 65.7 \n", "mesolitica/finetune-translation-t5-small-standa... 
67.43 \n", "mesolitica/finetune-translation-t5-base-standar... 67.6 \n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... 65.58 \n", + "mesolitica/finetune-noisy-translation-t5-small-... 65.51 \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 66.51 \n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... None \n", + "mesolitica/finetune-noisy-translation-t5-small-... None \n", + "mesolitica/finetune-noisy-translation-t5-base-b... None \n", "\n", " Suggested length \n", + "mesolitica/finetune-translation-t5-super-super-... 256 \n", "mesolitica/finetune-translation-t5-super-tiny-s... 256 \n", "mesolitica/finetune-translation-t5-tiny-standar... 256 \n", "mesolitica/finetune-translation-t5-small-standa... 256 \n", - "mesolitica/finetune-translation-t5-base-standar... 256 " + "mesolitica/finetune-translation-t5-base-standar... 256 \n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... 256 \n", + "mesolitica/finetune-noisy-translation-t5-small-... 256 \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 256 \n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... 256 \n", + "mesolitica/finetune-noisy-translation-t5-small-... 256 \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 
256 " ] }, "execution_count": 3, diff --git a/docs/load-translation-ms-en-huggingface.ipynb b/docs/load-translation-ms-en-huggingface.ipynb index 4da3db21..39426448 100644 --- a/docs/load-translation-ms-en-huggingface.ipynb +++ b/docs/load-translation-ms-en-huggingface.ipynb @@ -51,18 +51,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 3.51 s, sys: 3.29 s, total: 6.8 s\n", - "Wall time: 2.64 s\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/husein/dev/malaya/malaya/tokenizer.py:208: FutureWarning: Possible nested set at position 3372\n", - " self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n", - "/home/husein/dev/malaya/malaya/tokenizer.py:208: FutureWarning: Possible nested set at position 3890\n", - " self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n" + "CPU times: user 3.38 s, sys: 3.55 s, total: 6.93 s\n", + "Wall time: 2.23 s\n" ] } ], @@ -91,7 +81,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "INFO:malaya.translation.ms_en:tested on FLORES200 MS-EN (zsm_Latn-eng_Latn) pair, https://github.com/facebookresearch/flores/tree/main/flores200\n" + "INFO:malaya.translation.ms_en:tested on FLORES200 EN-MS (eng_Latn-zsm_Latn) pair `dev` set, https://github.com/facebookresearch/flores/tree/main/flores200\n", + "INFO:malaya.translation.ms_en:for noisy, tested on noisy twitter google translation, https://huggingface.co/datasets/mesolitica/augmentation-test-set\n" ] }, { @@ -124,6 +115,14 @@ " \n", " \n", " \n", + " mesolitica/finetune-translation-t5-super-super-tiny-standard-bahasa-cased\n", + " 23.3\n", + " 30.216144\n", + " 64.9/38.1/24.1/15.3 (BP = 0.978 ratio = 0.978 ...\n", + " 56.46\n", + " 256\n", + " \n", + " \n", " mesolitica/finetune-translation-t5-super-tiny-standard-bahasa-cased\n", " 50.7\n", " 34.105615\n", @@ -155,34 +154,110 @@ " 65.44\n", " 256\n", " \n", + " \n", + " mesolitica/finetune-noisy-translation-t5-tiny-bahasa-cased\n", + " 139\n", + " 39.725134\n", 
+ " 69.8/46.2/32.8/23.6 (BP = 0.999 ratio = 0.999 ...\n", + " None\n", + " 256\n", + " \n", + " \n", + " mesolitica/finetune-noisy-translation-t5-small-bahasa-cased\n", + " 242\n", + " 41.834071\n", + " 71.7/48.7/35.4/26.0 (BP = 0.989 ratio = 0.989 ...\n", + " None\n", + " 256\n", + " \n", + " \n", + " mesolitica/finetune-noisy-translation-t5-base-bahasa-cased\n", + " 892\n", + " 43.432723\n", + " 71.8/49.8/36.6/27.2 (BP = 1.000 ratio = 1.000 ...\n", + " None\n", + " 256\n", + " \n", + " \n", + " mesolitica/finetune-noisy-translation-t5-tiny-bahasa-cased-v2\n", + " 139\n", + " 60.000967\n", + " 77.9/63.9/54.6/47.7 (BP = 1.000 ratio = 1.036 ...\n", + " None\n", + " 256\n", + " \n", + " \n", + " mesolitica/finetune-noisy-translation-t5-small-bahasa-cased-v4\n", + " 242\n", + " 64.062582\n", + " 80.1/67.7/59.1/52.5 (BP = 1.000 ratio = 1.042 ...\n", + " None\n", + " 256\n", + " \n", + " \n", + " mesolitica/finetune-noisy-translation-t5-base-bahasa-cased-v2\n", + " 892\n", + " 64.583819\n", + " 80.2/68.1/59.8/53.2 (BP = 1.000 ratio = 1.048 ...\n", + " None\n", + " 256\n", + " \n", " \n", "\n", "" ], "text/plain": [ " Size (MB) BLEU \\\n", + "mesolitica/finetune-translation-t5-super-super-... 23.3 30.216144 \n", "mesolitica/finetune-translation-t5-super-tiny-s... 50.7 34.105615 \n", "mesolitica/finetune-translation-t5-tiny-standar... 139 37.260485 \n", "mesolitica/finetune-translation-t5-small-standa... 242 42.010218 \n", "mesolitica/finetune-translation-t5-base-standar... 892 43.408853 \n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... 139 39.725134 \n", + "mesolitica/finetune-noisy-translation-t5-small-... 242 41.834071 \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 892 43.432723 \n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... 139 60.000967 \n", + "mesolitica/finetune-noisy-translation-t5-small-... 242 64.062582 \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 
892 64.583819 \n", "\n", " SacreBLEU Verbose \\\n", + "mesolitica/finetune-translation-t5-super-super-... 64.9/38.1/24.1/15.3 (BP = 0.978 ratio = 0.978 ... \n", "mesolitica/finetune-translation-t5-super-tiny-s... 67.3/41.6/27.8/18.7 (BP = 0.982 ratio = 0.982 ... \n", "mesolitica/finetune-translation-t5-tiny-standar... 68.3/44.1/30.5/21.4 (BP = 0.995 ratio = 0.995 ... \n", "mesolitica/finetune-translation-t5-small-standa... 71.7/49.0/35.6/26.1 (BP = 0.989 ratio = 0.989 ... \n", "mesolitica/finetune-translation-t5-base-standar... 72.3/50.5/37.1/27.7 (BP = 0.987 ratio = 0.987 ... \n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... 69.8/46.2/32.8/23.6 (BP = 0.999 ratio = 0.999 ... \n", + "mesolitica/finetune-noisy-translation-t5-small-... 71.7/48.7/35.4/26.0 (BP = 0.989 ratio = 0.989 ... \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 71.8/49.8/36.6/27.2 (BP = 1.000 ratio = 1.000 ... \n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... 77.9/63.9/54.6/47.7 (BP = 1.000 ratio = 1.036 ... \n", + "mesolitica/finetune-noisy-translation-t5-small-... 80.1/67.7/59.1/52.5 (BP = 1.000 ratio = 1.042 ... \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 80.2/68.1/59.8/53.2 (BP = 1.000 ratio = 1.048 ... \n", "\n", " SacreBLEU-chrF++-FLORES200 \\\n", + "mesolitica/finetune-translation-t5-super-super-... 56.46 \n", "mesolitica/finetune-translation-t5-super-tiny-s... 59.18 \n", "mesolitica/finetune-translation-t5-tiny-standar... 61.29 \n", "mesolitica/finetune-translation-t5-small-standa... 64.67 \n", "mesolitica/finetune-translation-t5-base-standar... 65.44 \n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... None \n", + "mesolitica/finetune-noisy-translation-t5-small-... None \n", + "mesolitica/finetune-noisy-translation-t5-base-b... None \n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... None \n", + "mesolitica/finetune-noisy-translation-t5-small-... None \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 
None \n", "\n", " Suggested length \n", + "mesolitica/finetune-translation-t5-super-super-... 256 \n", "mesolitica/finetune-translation-t5-super-tiny-s... 256 \n", "mesolitica/finetune-translation-t5-tiny-standar... 256 \n", "mesolitica/finetune-translation-t5-small-standa... 256 \n", - "mesolitica/finetune-translation-t5-base-standar... 256 " + "mesolitica/finetune-translation-t5-base-standar... 256 \n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... 256 \n", + "mesolitica/finetune-noisy-translation-t5-small-... 256 \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 256 \n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... 256 \n", + "mesolitica/finetune-noisy-translation-t5-small-... 256 \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 256 " ] }, "execution_count": 3, diff --git a/docs/load-translation-noisy-en-ms-huggingface.ipynb b/docs/load-translation-noisy-en-ms-huggingface.ipynb index 3f509bd4..f1178caf 100644 --- a/docs/load-translation-noisy-en-ms-huggingface.ipynb +++ b/docs/load-translation-noisy-en-ms-huggingface.ipynb @@ -38,8 +38,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 3.22 s, sys: 3.45 s, total: 6.67 s\n", - "Wall time: 2.43 s\n" + "CPU times: user 3.27 s, sys: 3.58 s, total: 6.85 s\n", + "Wall time: 2.26 s\n" ] } ], @@ -69,7 +69,7 @@ "output_type": "stream", "text": [ "INFO:malaya.translation.en_ms:tested on FLORES200 EN-MS (eng_Latn-zsm_Latn) pair `dev` set, https://github.com/facebookresearch/flores/tree/main/flores200\n", - "INFO:malaya.translation.en_ms:for noisy, tested on noisy augmented FLORES200 EN-MS (eng_Latn-zsm_Latn) pair `dev` set, https://github.com/huseinzol05/malay-dataset/tree/master/translation/nllb-noisy-dev-augmentation\n" + "INFO:malaya.translation.en_ms:for noisy, tested on noisy twitter google translation, https://huggingface.co/datasets/mesolitica/augmentation-test-set\n" ] }, { @@ -168,24 +168,24 @@ " \n", " 
mesolitica/finetune-noisy-translation-t5-tiny-bahasa-cased-v2\n", " 139\n", - " 41.625536\n", - " 73.4/50.1/35.7/25.7 (BP = 0.971 ratio = 0.972 ...\n", + " 60.000967\n", + " 77.9/63.9/54.6/47.7 (BP = 1.000 ratio = 1.036 ...\n", " None\n", " 256\n", " \n", " \n", " mesolitica/finetune-noisy-translation-t5-small-bahasa-cased-v4\n", " 242\n", - " 41.625536\n", - " 73.4/50.1/35.7/25.7 (BP = 0.971 ratio = 0.972 ...\n", + " 64.062582\n", + " 80.1/67.7/59.1/52.5 (BP = 1.000 ratio = 1.042 ...\n", " None\n", " 256\n", " \n", " \n", " mesolitica/finetune-noisy-translation-t5-base-bahasa-cased-v2\n", " 892\n", - " 41.625536\n", - " 73.4/50.1/35.7/25.7 (BP = 0.971 ratio = 0.972 ...\n", + " 64.583819\n", + " 80.2/68.1/59.8/53.2 (BP = 1.000 ratio = 1.048 ...\n", " None\n", " 256\n", " \n", @@ -203,9 +203,9 @@ "mesolitica/finetune-noisy-translation-t5-tiny-b... 139 41.036414 \n", "mesolitica/finetune-noisy-translation-t5-small-... 242 41.15794 \n", "mesolitica/finetune-noisy-translation-t5-base-b... 892 41.827831 \n", - "mesolitica/finetune-noisy-translation-t5-tiny-b... 139 41.625536 \n", - "mesolitica/finetune-noisy-translation-t5-small-... 242 41.625536 \n", - "mesolitica/finetune-noisy-translation-t5-base-b... 892 41.625536 \n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... 139 60.000967 \n", + "mesolitica/finetune-noisy-translation-t5-small-... 242 64.062582 \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 892 64.583819 \n", "\n", " SacreBLEU Verbose \\\n", "mesolitica/finetune-translation-t5-super-super-... 71.2/46.0/30.9/21.0 (BP = 0.950 ratio = 0.951 ... \n", @@ -216,9 +216,9 @@ "mesolitica/finetune-noisy-translation-t5-tiny-b... 72.9/49.2/34.8/25.0 (BP = 0.977 ratio = 0.977 ... \n", "mesolitica/finetune-noisy-translation-t5-small-... 72.2/48.8/34.5/24.8 (BP = 0.988 ratio = 0.988 ... \n", "mesolitica/finetune-noisy-translation-t5-base-b... 73.4/50.1/35.7/25.8 (BP = 0.982 ratio = 0.982 ... \n", - "mesolitica/finetune-noisy-translation-t5-tiny-b... 
73.4/50.1/35.7/25.7 (BP = 0.971 ratio = 0.972 ... \n", - "mesolitica/finetune-noisy-translation-t5-small-... 73.4/50.1/35.7/25.7 (BP = 0.971 ratio = 0.972 ... \n", - "mesolitica/finetune-noisy-translation-t5-base-b... 73.4/50.1/35.7/25.7 (BP = 0.971 ratio = 0.972 ... \n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... 77.9/63.9/54.6/47.7 (BP = 1.000 ratio = 1.036 ... \n", + "mesolitica/finetune-noisy-translation-t5-small-... 80.1/67.7/59.1/52.5 (BP = 1.000 ratio = 1.042 ... \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 80.2/68.1/59.8/53.2 (BP = 1.000 ratio = 1.048 ... \n", "\n", " SacreBLEU-chrF++-FLORES200 \\\n", "mesolitica/finetune-translation-t5-super-super-... 61.89 \n", diff --git a/docs/load-translation-noisy-ms-en-huggingface.ipynb b/docs/load-translation-noisy-ms-en-huggingface.ipynb index 576dab63..da6f06cf 100644 --- a/docs/load-translation-noisy-ms-en-huggingface.ipynb +++ b/docs/load-translation-noisy-ms-en-huggingface.ipynb @@ -38,8 +38,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 3.25 s, sys: 3.34 s, total: 6.59 s\n", - "Wall time: 2.44 s\n" + "CPU times: user 3.35 s, sys: 3.49 s, total: 6.84 s\n", + "Wall time: 2.33 s\n" ] } ], @@ -159,7 +159,7 @@ " \n", " \n", " mesolitica/finetune-noisy-translation-t5-base-bahasa-cased\n", - " 242\n", + " 892\n", " 43.432723\n", " 71.8/49.8/36.6/27.2 (BP = 1.000 ratio = 1.000 ...\n", " None\n", @@ -168,24 +168,24 @@ " \n", " mesolitica/finetune-noisy-translation-t5-tiny-bahasa-cased-v2\n", " 139\n", - " 41.625536\n", - " 73.4/50.1/35.7/25.7 (BP = 0.971 ratio = 0.972 ...\n", + " 60.000967\n", + " 77.9/63.9/54.6/47.7 (BP = 1.000 ratio = 1.036 ...\n", " None\n", " 256\n", " \n", " \n", " mesolitica/finetune-noisy-translation-t5-small-bahasa-cased-v4\n", " 242\n", - " 41.625536\n", - " 73.4/50.1/35.7/25.7 (BP = 0.971 ratio = 0.972 ...\n", + " 64.062582\n", + " 80.1/67.7/59.1/52.5 (BP = 1.000 ratio = 1.042 ...\n", " None\n", " 256\n", " \n", " \n", " 
mesolitica/finetune-noisy-translation-t5-base-bahasa-cased-v2\n", " 892\n", - " 41.625536\n", - " 73.4/50.1/35.7/25.7 (BP = 0.971 ratio = 0.972 ...\n", + " 64.583819\n", + " 80.2/68.1/59.8/53.2 (BP = 1.000 ratio = 1.048 ...\n", " None\n", " 256\n", " \n", @@ -202,10 +202,10 @@ "mesolitica/finetune-translation-t5-base-standar... 892 43.408853 \n", "mesolitica/finetune-noisy-translation-t5-tiny-b... 139 39.725134 \n", "mesolitica/finetune-noisy-translation-t5-small-... 242 41.834071 \n", - "mesolitica/finetune-noisy-translation-t5-base-b... 242 43.432723 \n", - "mesolitica/finetune-noisy-translation-t5-tiny-b... 139 41.625536 \n", - "mesolitica/finetune-noisy-translation-t5-small-... 242 41.625536 \n", - "mesolitica/finetune-noisy-translation-t5-base-b... 892 41.625536 \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 892 43.432723 \n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... 139 60.000967 \n", + "mesolitica/finetune-noisy-translation-t5-small-... 242 64.062582 \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 892 64.583819 \n", "\n", " SacreBLEU Verbose \\\n", "mesolitica/finetune-translation-t5-super-super-... 64.9/38.1/24.1/15.3 (BP = 0.978 ratio = 0.978 ... \n", @@ -216,9 +216,9 @@ "mesolitica/finetune-noisy-translation-t5-tiny-b... 69.8/46.2/32.8/23.6 (BP = 0.999 ratio = 0.999 ... \n", "mesolitica/finetune-noisy-translation-t5-small-... 71.7/48.7/35.4/26.0 (BP = 0.989 ratio = 0.989 ... \n", "mesolitica/finetune-noisy-translation-t5-base-b... 71.8/49.8/36.6/27.2 (BP = 1.000 ratio = 1.000 ... \n", - "mesolitica/finetune-noisy-translation-t5-tiny-b... 73.4/50.1/35.7/25.7 (BP = 0.971 ratio = 0.972 ... \n", - "mesolitica/finetune-noisy-translation-t5-small-... 73.4/50.1/35.7/25.7 (BP = 0.971 ratio = 0.972 ... \n", - "mesolitica/finetune-noisy-translation-t5-base-b... 73.4/50.1/35.7/25.7 (BP = 0.971 ratio = 0.972 ... \n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... 77.9/63.9/54.6/47.7 (BP = 1.000 ratio = 1.036 ... 
\n", + "mesolitica/finetune-noisy-translation-t5-small-... 80.1/67.7/59.1/52.5 (BP = 1.000 ratio = 1.042 ... \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 80.2/68.1/59.8/53.2 (BP = 1.000 ratio = 1.048 ... \n", "\n", " SacreBLEU-chrF++-FLORES200 \\\n", "mesolitica/finetune-translation-t5-super-super-... 56.46 \n", diff --git a/example/augmentation-abstractive/load-augmentation-abstractive.ipynb b/example/augmentation-abstractive/load-augmentation-abstractive.ipynb index ed6de20c..48d0d38c 100644 --- a/example/augmentation-abstractive/load-augmentation-abstractive.ipynb +++ b/example/augmentation-abstractive/load-augmentation-abstractive.ipynb @@ -50,8 +50,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 3.02 s, sys: 3.63 s, total: 6.65 s\n", - "Wall time: 2.51 s\n" + "CPU times: user 3.23 s, sys: 3.54 s, total: 6.77 s\n", + "Wall time: 2.25 s\n" ] } ], @@ -106,45 +106,52 @@ " \n", " \n", " Size (MB)\n", - " ROUGE-1\n", - " ROUGE-2\n", - " ROUGE-L\n", + " BLEU\n", + " SacreBLEU Verbose\n", " Suggested length\n", " \n", " \n", " \n", " \n", + " mesolitica/finetune-noisy-translation-t5-tiny-bahasa-cased-v2\n", + " 139\n", + " 60.000967\n", + " 77.9/63.9/54.6/47.7 (BP = 1.000 ratio = 1.036 ...\n", + " 256\n", + " \n", + " \n", " mesolitica/finetune-noisy-translation-t5-small-bahasa-cased-v4\n", - " 242.0\n", - " 0.757218\n", - " 0.496729\n", - " 0.304022\n", - " 256.0\n", + " 242\n", + " 64.062582\n", + " 80.1/67.7/59.1/52.5 (BP = 1.000 ratio = 1.042 ...\n", + " 256\n", " \n", " \n", " mesolitica/finetune-noisy-translation-t5-base-bahasa-cased-v2\n", - " 892.0\n", - " 0.713227\n", - " 0.470135\n", - " 0.366797\n", - " 256.0\n", + " 892\n", + " 64.583819\n", + " 80.2/68.1/59.8/53.2 (BP = 1.000 ratio = 1.048 ...\n", + " 256\n", " \n", " \n", "\n", "" ], "text/plain": [ - " Size (MB) ROUGE-1 \\\n", - "mesolitica/finetune-noisy-translation-t5-small-... 242.0 0.757218 \n", - "mesolitica/finetune-noisy-translation-t5-base-b... 
892.0 0.713227 \n", + " Size (MB) BLEU \\\n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... 139 60.000967 \n", + "mesolitica/finetune-noisy-translation-t5-small-... 242 64.062582 \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 892 64.583819 \n", "\n", - " ROUGE-2 ROUGE-L \\\n", - "mesolitica/finetune-noisy-translation-t5-small-... 0.496729 0.304022 \n", - "mesolitica/finetune-noisy-translation-t5-base-b... 0.470135 0.366797 \n", + " SacreBLEU Verbose \\\n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... 77.9/63.9/54.6/47.7 (BP = 1.000 ratio = 1.036 ... \n", + "mesolitica/finetune-noisy-translation-t5-small-... 80.1/67.7/59.1/52.5 (BP = 1.000 ratio = 1.042 ... \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 80.2/68.1/59.8/53.2 (BP = 1.000 ratio = 1.048 ... \n", "\n", - " Suggested length \n", - "mesolitica/finetune-noisy-translation-t5-small-... 256.0 \n", - "mesolitica/finetune-noisy-translation-t5-base-b... 256.0 " + " Suggested length \n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... 256 \n", + "mesolitica/finetune-noisy-translation-t5-small-... 256 \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 
256 " ] }, "execution_count": 3, @@ -193,9 +200,38 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b9b42d7176db4aa78f8145770bbf783f", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading: 0%| | 0.00/826 [00:00\n", " \n", " mesolitica/finetune-dependency-t5-tiny-standard-bahasa-cased\n", - " 61.2\n", - " 0.84929\n", - " 0.8281\n", - " 0.92099\n", + " 143.0\n", + " 0.850607\n", + " 0.783164\n", + " 0.872302\n", " \n", " \n", " mesolitica/finetune-dependency-t5-small-standard-bahasa-cased\n", - " 61.2\n", - " 0.84929\n", - " 0.8281\n", - " 0.92099\n", + " 247.0\n", + " 0.849405\n", + " 0.783103\n", + " 0.866906\n", " \n", " \n", " mesolitica/finetune-dependency-t5-base-standard-bahasa-cased\n", - " 61.2\n", - " 0.84929\n", - " 0.8281\n", - " 0.92099\n", + " 898.0\n", + " 0.852892\n", + " 0.784091\n", + " 0.859712\n", " \n", " \n", "\n", @@ -356,19 +356,19 @@ ], "text/plain": [ " Size (MB) Arc Accuracy \\\n", - "mesolitica/finetune-dependency-t5-tiny-standard... 61.2 0.84929 \n", - "mesolitica/finetune-dependency-t5-small-standar... 61.2 0.84929 \n", - "mesolitica/finetune-dependency-t5-base-standard... 61.2 0.84929 \n", + "mesolitica/finetune-dependency-t5-tiny-standard... 143.0 0.850607 \n", + "mesolitica/finetune-dependency-t5-small-standar... 247.0 0.849405 \n", + "mesolitica/finetune-dependency-t5-base-standard... 898.0 0.852892 \n", "\n", " Types Accuracy \\\n", - "mesolitica/finetune-dependency-t5-tiny-standard... 0.8281 \n", - "mesolitica/finetune-dependency-t5-small-standar... 0.8281 \n", - "mesolitica/finetune-dependency-t5-base-standard... 0.8281 \n", + "mesolitica/finetune-dependency-t5-tiny-standard... 0.783164 \n", + "mesolitica/finetune-dependency-t5-small-standar... 0.783103 \n", + "mesolitica/finetune-dependency-t5-base-standard... 
0.784091 \n", "\n", " Root Accuracy \n", - "mesolitica/finetune-dependency-t5-tiny-standard... 0.92099 \n", - "mesolitica/finetune-dependency-t5-small-standar... 0.92099 \n", - "mesolitica/finetune-dependency-t5-base-standard... 0.92099 " + "mesolitica/finetune-dependency-t5-tiny-standard... 0.872302 \n", + "mesolitica/finetune-dependency-t5-small-standar... 0.866906 \n", + "mesolitica/finetune-dependency-t5-base-standard... 0.859712 " ] }, "execution_count": 5, @@ -3625,5 +3625,4 @@ } }, "nbformat": 4, - "nbformat_minor": 2 -} + "nbforma \ No newline at end of file diff --git a/example/en-ms-translation-huggingface/load-translation-en-ms-huggingface.ipynb b/example/en-ms-translation-huggingface/load-translation-en-ms-huggingface.ipynb index 373d808b..0bf28f77 100644 --- a/example/en-ms-translation-huggingface/load-translation-en-ms-huggingface.ipynb +++ b/example/en-ms-translation-huggingface/load-translation-en-ms-huggingface.ipynb @@ -49,18 +49,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 3.54 s, sys: 3.2 s, total: 6.74 s\n", - "Wall time: 2.65 s\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/husein/dev/malaya/malaya/tokenizer.py:208: FutureWarning: Possible nested set at position 3372\n", - " self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n", - "/home/husein/dev/malaya/malaya/tokenizer.py:208: FutureWarning: Possible nested set at position 3890\n", - " self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n" + "CPU times: user 3.16 s, sys: 3.76 s, total: 6.93 s\n", + "Wall time: 2.3 s\n" ] } ], @@ -91,7 +81,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "INFO:malaya.translation.en_ms:tested on FLORES200 EN-MS (eng_Latn-zsm_Latn) pair `dev` set, https://github.com/facebookresearch/flores/tree/main/flores200\n" + "INFO:malaya.translation.en_ms:tested on FLORES200 EN-MS (eng_Latn-zsm_Latn) pair `dev` set, 
https://github.com/facebookresearch/flores/tree/main/flores200\n", + "INFO:malaya.translation.en_ms:for noisy, tested on noisy twitter google translation, https://huggingface.co/datasets/mesolitica/augmentation-test-set\n" ] }, { @@ -124,6 +115,14 @@ " \n", " \n", " \n", + " mesolitica/finetune-translation-t5-super-super-tiny-standard-bahasa-cased\n", + " 23.3\n", + " 36.290743\n", + " 71.2/46.0/30.9/21.0 (BP = 0.950 ratio = 0.951 ...\n", + " 61.89\n", + " 256\n", + " \n", + " \n", " mesolitica/finetune-translation-t5-super-tiny-standard-bahasa-cased\n", " 50.7\n", " 39.188342\n", @@ -155,34 +154,110 @@ " 67.6\n", " 256\n", " \n", + " \n", + " mesolitica/finetune-noisy-translation-t5-tiny-bahasa-cased\n", + " 139\n", + " 41.036414\n", + " 72.9/49.2/34.8/25.0 (BP = 0.977 ratio = 0.977 ...\n", + " 65.58\n", + " 256\n", + " \n", + " \n", + " mesolitica/finetune-noisy-translation-t5-small-bahasa-cased\n", + " 242\n", + " 41.15794\n", + " 72.2/48.8/34.5/24.8 (BP = 0.988 ratio = 0.988 ...\n", + " 65.51\n", + " 256\n", + " \n", + " \n", + " mesolitica/finetune-noisy-translation-t5-base-bahasa-cased\n", + " 892\n", + " 41.827831\n", + " 73.4/50.1/35.7/25.8 (BP = 0.982 ratio = 0.982 ...\n", + " 66.51\n", + " 256\n", + " \n", + " \n", + " mesolitica/finetune-noisy-translation-t5-tiny-bahasa-cased-v2\n", + " 139\n", + " 60.000967\n", + " 77.9/63.9/54.6/47.7 (BP = 1.000 ratio = 1.036 ...\n", + " None\n", + " 256\n", + " \n", + " \n", + " mesolitica/finetune-noisy-translation-t5-small-bahasa-cased-v4\n", + " 242\n", + " 64.062582\n", + " 80.1/67.7/59.1/52.5 (BP = 1.000 ratio = 1.042 ...\n", + " None\n", + " 256\n", + " \n", + " \n", + " mesolitica/finetune-noisy-translation-t5-base-bahasa-cased-v2\n", + " 892\n", + " 64.583819\n", + " 80.2/68.1/59.8/53.2 (BP = 1.000 ratio = 1.048 ...\n", + " None\n", + " 256\n", + " \n", " \n", "\n", "" ], "text/plain": [ " Size (MB) BLEU \\\n", + "mesolitica/finetune-translation-t5-super-super-... 
23.3 36.290743 \n", "mesolitica/finetune-translation-t5-super-tiny-s... 50.7 39.188342 \n", "mesolitica/finetune-translation-t5-tiny-standar... 139 41.625536 \n", "mesolitica/finetune-translation-t5-small-standa... 242 43.937298 \n", "mesolitica/finetune-translation-t5-base-standar... 892 44.173559 \n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... 139 41.036414 \n", + "mesolitica/finetune-noisy-translation-t5-small-... 242 41.15794 \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 892 41.827831 \n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... 139 60.000967 \n", + "mesolitica/finetune-noisy-translation-t5-small-... 242 64.062582 \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 892 64.583819 \n", "\n", " SacreBLEU Verbose \\\n", + "mesolitica/finetune-translation-t5-super-super-... 71.2/46.0/30.9/21.0 (BP = 0.950 ratio = 0.951 ... \n", "mesolitica/finetune-translation-t5-super-tiny-s... 72.6/48.3/33.5/23.6 (BP = 0.960 ratio = 0.961 ... \n", "mesolitica/finetune-translation-t5-tiny-standar... 73.4/50.1/35.7/25.7 (BP = 0.971 ratio = 0.972 ... \n", "mesolitica/finetune-translation-t5-small-standa... 74.9/52.2/37.9/27.7 (BP = 0.976 ratio = 0.977 ... \n", "mesolitica/finetune-translation-t5-base-standar... 74.7/52.3/38.0/28.0 (BP = 0.979 ratio = 0.979 ... \n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... 72.9/49.2/34.8/25.0 (BP = 0.977 ratio = 0.977 ... \n", + "mesolitica/finetune-noisy-translation-t5-small-... 72.2/48.8/34.5/24.8 (BP = 0.988 ratio = 0.988 ... \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 73.4/50.1/35.7/25.8 (BP = 0.982 ratio = 0.982 ... \n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... 77.9/63.9/54.6/47.7 (BP = 1.000 ratio = 1.036 ... \n", + "mesolitica/finetune-noisy-translation-t5-small-... 80.1/67.7/59.1/52.5 (BP = 1.000 ratio = 1.042 ... \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 80.2/68.1/59.8/53.2 (BP = 1.000 ratio = 1.048 ... 
\n", "\n", " SacreBLEU-chrF++-FLORES200 \\\n", + "mesolitica/finetune-translation-t5-super-super-... 61.89 \n", "mesolitica/finetune-translation-t5-super-tiny-s... 64.03 \n", "mesolitica/finetune-translation-t5-tiny-standar... 65.7 \n", "mesolitica/finetune-translation-t5-small-standa... 67.43 \n", "mesolitica/finetune-translation-t5-base-standar... 67.6 \n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... 65.58 \n", + "mesolitica/finetune-noisy-translation-t5-small-... 65.51 \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 66.51 \n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... None \n", + "mesolitica/finetune-noisy-translation-t5-small-... None \n", + "mesolitica/finetune-noisy-translation-t5-base-b... None \n", "\n", " Suggested length \n", + "mesolitica/finetune-translation-t5-super-super-... 256 \n", "mesolitica/finetune-translation-t5-super-tiny-s... 256 \n", "mesolitica/finetune-translation-t5-tiny-standar... 256 \n", "mesolitica/finetune-translation-t5-small-standa... 256 \n", - "mesolitica/finetune-translation-t5-base-standar... 256 " + "mesolitica/finetune-translation-t5-base-standar... 256 \n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... 256 \n", + "mesolitica/finetune-noisy-translation-t5-small-... 256 \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 256 \n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... 256 \n", + "mesolitica/finetune-noisy-translation-t5-small-... 256 \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 256 " ] }, "execution_count": 3, @@ -471,133 +546,4 @@ " 'sumbangan di New York. \"Saya memerintahkan Black Lives Matter Foundation '\n", " 'untuk berhenti menerima sumbangan secara haram yang ditujukan untuk gerakan '\n", " '#BlackLivesMatter. 
Yayasan ini tidak berafiliasi dengan gerakan itu, namun '\n", - " 'ia menerima banyak sumbangan dan menipu muhibah,\" tweet James.',\n", - " 'Di antara inisiatif yang dicadangkan adalah kerangka pelabelan makanan '\n", - " 'lestari, reformasi makanan yang diproses, dan bab kelestarian dalam semua '\n", - " 'perjanjian perdagangan dua hala EU. EU juga merancang untuk menerbitkan '\n", - " 'cadangan untuk kerangka perundangan untuk sistem makanan lestari pada tahun '\n", - " '2023 untuk memastikan semua makanan di pasaran EU menjadi semakin lestari.',\n", - " 'Halaman ini berkongsi artikel terbaik saya untuk dibaca mengenai topik '\n", - " 'seperti kesihatan, kebahagiaan, kreativiti, produktiviti dan banyak lagi. '\n", - " 'Soalan utama yang mendorong karya saya adalah, Bagaimana kita dapat hidup '\n", - " 'lebih baik? \"Untuk menjawab soalan itu, saya ingin menulis mengenai kaedah '\n", - " 'berasaskan sains untuk menyelesaikan masalah praktikal.']\n", - "CPU times: user 13.8 s, sys: 16 ms, total: 13.9 s\n", - "Wall time: 1.21 s\n" - ] - } - ], - "source": [ - "%%time\n", - "\n", - "pprint(transformer_huggingface.generate([string_news1, string_news2, string_news3, string_article1],\n", - " max_length = 1000))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### compare with Google translate using googletrans\n", - "\n", - "Install it by,\n", - "\n", - "```bash\n", - "pip3 install googletrans==4.0.0rc1\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "from googletrans import Translator\n", - "\n", - "translator = Translator()" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "strings = [string_news1, string_news2, string_news3, string_article1]" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - 
"KUALA LUMPUR, 1 Julai - Datuk Seri Anwar Ibrahim tidak sesuai sebagai calon Perdana Menteri kerana dia tidak \"popular\" di kalangan orang Melayu, Tun Dr Mahathir Mohamad mendakwa.Bekas Perdana Menteri dilaporkan berkata presiden PKR memerlukan seseorang seperti dirinya untuk memperoleh sokongan daripada orang Melayu dan memenangi pilihan raya.\n", - "(CNN) Peguam Negara New York, Letitia James pada hari Isnin mengarahkan Yayasan Black Lives Matter - yang dikatakannya tidak bergabung dengan gerakan Black Lives Matter yang lebih besar - untuk berhenti mengumpul sumbangan di New York.\"Saya mengarahkan Yayasan Black Lives Matter untuk berhenti menerima sumbangan secara haram yang dimaksudkan untuk gerakan #BlackLivesMatter.\n", - "Di antara inisiatif yang luas yang dicadangkan adalah rangka kerja pelabelan makanan yang mampan, pembaharuan makanan yang diproses, dan bab kemampanan dalam semua perjanjian perdagangan dua hala EU.EU juga merancang untuk menerbitkan cadangan untuk rangka kerja perundangan untuk sistem makanan lestari menjelang 2023 untuk memastikan semua makanan di pasaran EU menjadi semakin mampan.\n", - "Halaman ini berkongsi artikel terbaik saya untuk membaca topik seperti kesihatan, kebahagiaan, kreativiti, produktiviti dan banyak lagi.Soalan utama yang mendorong kerja saya adalah, \"Bagaimana kita dapat hidup lebih baik?\"Untuk menjawab soalan itu, saya ingin menulis tentang cara berasaskan sains untuk menyelesaikan masalah praktikal.\n" - ] - } - ], - "source": [ - "for t in strings:\n", - " r = translator.translate(t, src='en', dest = 'ms')\n", - " print(r.text)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - 
}, - "varInspector": { - "cols": { - "lenName": 16, - "lenType": 16, - "lenVar": 40 - }, - "kernels_config": { - "python": { - "delete_cmd_postfix": "", - "delete_cmd_prefix": "del ", - "library": "var_list.py", - "varRefreshCmd": "print(var_dic_list())" - }, - "r": { - "delete_cmd_postfix": ") ", - "delete_cmd_prefix": "rm(", - "library": "var_list.r", - "varRefreshCmd": "cat(var_dic_list()) " - } - }, - "types_to_exclude": [ - "module", - "function", - "builtin_function_or_method", - "instance", - "_Feature" - ], - "window_display": false - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} + " 'ia menerima banyak sumbangan dan \ No newline at end of file diff --git a/example/ms-en-translation-huggingface/load-translation-ms-en-huggingface.ipynb b/example/ms-en-translation-huggingface/load-translation-ms-en-huggingface.ipynb index 4da3db21..39426448 100644 --- a/example/ms-en-translation-huggingface/load-translation-ms-en-huggingface.ipynb +++ b/example/ms-en-translation-huggingface/load-translation-ms-en-huggingface.ipynb @@ -51,18 +51,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 3.51 s, sys: 3.29 s, total: 6.8 s\n", - "Wall time: 2.64 s\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/husein/dev/malaya/malaya/tokenizer.py:208: FutureWarning: Possible nested set at position 3372\n", - " self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n", - "/home/husein/dev/malaya/malaya/tokenizer.py:208: FutureWarning: Possible nested set at position 3890\n", - " self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n" + "CPU times: user 3.38 s, sys: 3.55 s, total: 6.93 s\n", + "Wall time: 2.23 s\n" ] } ], @@ -91,7 +81,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "INFO:malaya.translation.ms_en:tested on FLORES200 MS-EN (zsm_Latn-eng_Latn) pair, https://github.com/facebookresearch/flores/tree/main/flores200\n" + "INFO:malaya.translation.ms_en:tested on FLORES200 EN-MS 
(eng_Latn-zsm_Latn) pair `dev` set, https://github.com/facebookresearch/flores/tree/main/flores200\n", + "INFO:malaya.translation.ms_en:for noisy, tested on noisy twitter google translation, https://huggingface.co/datasets/mesolitica/augmentation-test-set\n" ] }, { @@ -124,6 +115,14 @@ " \n", " \n", " \n", + " mesolitica/finetune-translation-t5-super-super-tiny-standard-bahasa-cased\n", + " 23.3\n", + " 30.216144\n", + " 64.9/38.1/24.1/15.3 (BP = 0.978 ratio = 0.978 ...\n", + " 56.46\n", + " 256\n", + " \n", + " \n", " mesolitica/finetune-translation-t5-super-tiny-standard-bahasa-cased\n", " 50.7\n", " 34.105615\n", @@ -155,34 +154,110 @@ " 65.44\n", " 256\n", " \n", + " \n", + " mesolitica/finetune-noisy-translation-t5-tiny-bahasa-cased\n", + " 139\n", + " 39.725134\n", + " 69.8/46.2/32.8/23.6 (BP = 0.999 ratio = 0.999 ...\n", + " None\n", + " 256\n", + " \n", + " \n", + " mesolitica/finetune-noisy-translation-t5-small-bahasa-cased\n", + " 242\n", + " 41.834071\n", + " 71.7/48.7/35.4/26.0 (BP = 0.989 ratio = 0.989 ...\n", + " None\n", + " 256\n", + " \n", + " \n", + " mesolitica/finetune-noisy-translation-t5-base-bahasa-cased\n", + " 892\n", + " 43.432723\n", + " 71.8/49.8/36.6/27.2 (BP = 1.000 ratio = 1.000 ...\n", + " None\n", + " 256\n", + " \n", + " \n", + " mesolitica/finetune-noisy-translation-t5-tiny-bahasa-cased-v2\n", + " 139\n", + " 60.000967\n", + " 77.9/63.9/54.6/47.7 (BP = 1.000 ratio = 1.036 ...\n", + " None\n", + " 256\n", + " \n", + " \n", + " mesolitica/finetune-noisy-translation-t5-small-bahasa-cased-v4\n", + " 242\n", + " 64.062582\n", + " 80.1/67.7/59.1/52.5 (BP = 1.000 ratio = 1.042 ...\n", + " None\n", + " 256\n", + " \n", + " \n", + " mesolitica/finetune-noisy-translation-t5-base-bahasa-cased-v2\n", + " 892\n", + " 64.583819\n", + " 80.2/68.1/59.8/53.2 (BP = 1.000 ratio = 1.048 ...\n", + " None\n", + " 256\n", + " \n", " \n", "\n", "" ], "text/plain": [ " Size (MB) BLEU \\\n", + "mesolitica/finetune-translation-t5-super-super-... 
23.3 30.216144 \n", "mesolitica/finetune-translation-t5-super-tiny-s... 50.7 34.105615 \n", "mesolitica/finetune-translation-t5-tiny-standar... 139 37.260485 \n", "mesolitica/finetune-translation-t5-small-standa... 242 42.010218 \n", "mesolitica/finetune-translation-t5-base-standar... 892 43.408853 \n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... 139 39.725134 \n", + "mesolitica/finetune-noisy-translation-t5-small-... 242 41.834071 \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 892 43.432723 \n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... 139 60.000967 \n", + "mesolitica/finetune-noisy-translation-t5-small-... 242 64.062582 \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 892 64.583819 \n", "\n", " SacreBLEU Verbose \\\n", + "mesolitica/finetune-translation-t5-super-super-... 64.9/38.1/24.1/15.3 (BP = 0.978 ratio = 0.978 ... \n", "mesolitica/finetune-translation-t5-super-tiny-s... 67.3/41.6/27.8/18.7 (BP = 0.982 ratio = 0.982 ... \n", "mesolitica/finetune-translation-t5-tiny-standar... 68.3/44.1/30.5/21.4 (BP = 0.995 ratio = 0.995 ... \n", "mesolitica/finetune-translation-t5-small-standa... 71.7/49.0/35.6/26.1 (BP = 0.989 ratio = 0.989 ... \n", "mesolitica/finetune-translation-t5-base-standar... 72.3/50.5/37.1/27.7 (BP = 0.987 ratio = 0.987 ... \n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... 69.8/46.2/32.8/23.6 (BP = 0.999 ratio = 0.999 ... \n", + "mesolitica/finetune-noisy-translation-t5-small-... 71.7/48.7/35.4/26.0 (BP = 0.989 ratio = 0.989 ... \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 71.8/49.8/36.6/27.2 (BP = 1.000 ratio = 1.000 ... \n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... 77.9/63.9/54.6/47.7 (BP = 1.000 ratio = 1.036 ... \n", + "mesolitica/finetune-noisy-translation-t5-small-... 80.1/67.7/59.1/52.5 (BP = 1.000 ratio = 1.042 ... \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 80.2/68.1/59.8/53.2 (BP = 1.000 ratio = 1.048 ... 
\n", "\n", " SacreBLEU-chrF++-FLORES200 \\\n", + "mesolitica/finetune-translation-t5-super-super-... 56.46 \n", "mesolitica/finetune-translation-t5-super-tiny-s... 59.18 \n", "mesolitica/finetune-translation-t5-tiny-standar... 61.29 \n", "mesolitica/finetune-translation-t5-small-standa... 64.67 \n", "mesolitica/finetune-translation-t5-base-standar... 65.44 \n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... None \n", + "mesolitica/finetune-noisy-translation-t5-small-... None \n", + "mesolitica/finetune-noisy-translation-t5-base-b... None \n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... None \n", + "mesolitica/finetune-noisy-translation-t5-small-... None \n", + "mesolitica/finetune-noisy-translation-t5-base-b... None \n", "\n", " Suggested length \n", + "mesolitica/finetune-translation-t5-super-super-... 256 \n", "mesolitica/finetune-translation-t5-super-tiny-s... 256 \n", "mesolitica/finetune-translation-t5-tiny-standar... 256 \n", "mesolitica/finetune-translation-t5-small-standa... 256 \n", - "mesolitica/finetune-translation-t5-base-standar... 256 " + "mesolitica/finetune-translation-t5-base-standar... 256 \n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... 256 \n", + "mesolitica/finetune-noisy-translation-t5-small-... 256 \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 256 \n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... 256 \n", + "mesolitica/finetune-noisy-translation-t5-small-... 256 \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 
256 " ] }, "execution_count": 3, diff --git a/example/noisy-ms-en-translation-huggingface/load-translation-noisy-ms-en-huggingface.ipynb b/example/noisy-ms-en-translation-huggingface/load-translation-noisy-ms-en-huggingface.ipynb index 576dab63..da6f06cf 100644 --- a/example/noisy-ms-en-translation-huggingface/load-translation-noisy-ms-en-huggingface.ipynb +++ b/example/noisy-ms-en-translation-huggingface/load-translation-noisy-ms-en-huggingface.ipynb @@ -38,8 +38,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 3.25 s, sys: 3.34 s, total: 6.59 s\n", - "Wall time: 2.44 s\n" + "CPU times: user 3.35 s, sys: 3.49 s, total: 6.84 s\n", + "Wall time: 2.33 s\n" ] } ], @@ -159,7 +159,7 @@ " \n", " \n", " mesolitica/finetune-noisy-translation-t5-base-bahasa-cased\n", - " 242\n", + " 892\n", " 43.432723\n", " 71.8/49.8/36.6/27.2 (BP = 1.000 ratio = 1.000 ...\n", " None\n", @@ -168,24 +168,24 @@ " \n", " mesolitica/finetune-noisy-translation-t5-tiny-bahasa-cased-v2\n", " 139\n", - " 41.625536\n", - " 73.4/50.1/35.7/25.7 (BP = 0.971 ratio = 0.972 ...\n", + " 60.000967\n", + " 77.9/63.9/54.6/47.7 (BP = 1.000 ratio = 1.036 ...\n", " None\n", " 256\n", " \n", " \n", " mesolitica/finetune-noisy-translation-t5-small-bahasa-cased-v4\n", " 242\n", - " 41.625536\n", - " 73.4/50.1/35.7/25.7 (BP = 0.971 ratio = 0.972 ...\n", + " 64.062582\n", + " 80.1/67.7/59.1/52.5 (BP = 1.000 ratio = 1.042 ...\n", " None\n", " 256\n", " \n", " \n", " mesolitica/finetune-noisy-translation-t5-base-bahasa-cased-v2\n", " 892\n", - " 41.625536\n", - " 73.4/50.1/35.7/25.7 (BP = 0.971 ratio = 0.972 ...\n", + " 64.583819\n", + " 80.2/68.1/59.8/53.2 (BP = 1.000 ratio = 1.048 ...\n", " None\n", " 256\n", " \n", @@ -202,10 +202,10 @@ "mesolitica/finetune-translation-t5-base-standar... 892 43.408853 \n", "mesolitica/finetune-noisy-translation-t5-tiny-b... 139 39.725134 \n", "mesolitica/finetune-noisy-translation-t5-small-... 
242 41.834071 \n", - "mesolitica/finetune-noisy-translation-t5-base-b... 242 43.432723 \n", - "mesolitica/finetune-noisy-translation-t5-tiny-b... 139 41.625536 \n", - "mesolitica/finetune-noisy-translation-t5-small-... 242 41.625536 \n", - "mesolitica/finetune-noisy-translation-t5-base-b... 892 41.625536 \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 892 43.432723 \n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... 139 60.000967 \n", + "mesolitica/finetune-noisy-translation-t5-small-... 242 64.062582 \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 892 64.583819 \n", "\n", " SacreBLEU Verbose \\\n", "mesolitica/finetune-translation-t5-super-super-... 64.9/38.1/24.1/15.3 (BP = 0.978 ratio = 0.978 ... \n", @@ -216,9 +216,9 @@ "mesolitica/finetune-noisy-translation-t5-tiny-b... 69.8/46.2/32.8/23.6 (BP = 0.999 ratio = 0.999 ... \n", "mesolitica/finetune-noisy-translation-t5-small-... 71.7/48.7/35.4/26.0 (BP = 0.989 ratio = 0.989 ... \n", "mesolitica/finetune-noisy-translation-t5-base-b... 71.8/49.8/36.6/27.2 (BP = 1.000 ratio = 1.000 ... \n", - "mesolitica/finetune-noisy-translation-t5-tiny-b... 73.4/50.1/35.7/25.7 (BP = 0.971 ratio = 0.972 ... \n", - "mesolitica/finetune-noisy-translation-t5-small-... 73.4/50.1/35.7/25.7 (BP = 0.971 ratio = 0.972 ... \n", - "mesolitica/finetune-noisy-translation-t5-base-b... 73.4/50.1/35.7/25.7 (BP = 0.971 ratio = 0.972 ... \n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... 77.9/63.9/54.6/47.7 (BP = 1.000 ratio = 1.036 ... \n", + "mesolitica/finetune-noisy-translation-t5-small-... 80.1/67.7/59.1/52.5 (BP = 1.000 ratio = 1.042 ... \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 80.2/68.1/59.8/53.2 (BP = 1.000 ratio = 1.048 ... \n", "\n", " SacreBLEU-chrF++-FLORES200 \\\n", "mesolitica/finetune-translation-t5-super-super-... 
56.46 \n", diff --git a/example/normalizer-abstractive/load-normalizer-abstractive.ipynb b/example/normalizer-abstractive/load-normalizer-abstractive.ipynb index 83240427..bd10f282 100644 --- a/example/normalizer-abstractive/load-normalizer-abstractive.ipynb +++ b/example/normalizer-abstractive/load-normalizer-abstractive.ipynb @@ -49,8 +49,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 3.18 s, sys: 3.4 s, total: 6.58 s\n", - "Wall time: 2.33 s\n" + "CPU times: user 3.24 s, sys: 3.67 s, total: 6.91 s\n", + "Wall time: 2.25 s\n" ] } ], @@ -93,45 +93,52 @@ " \n", " \n", " Size (MB)\n", - " ROUGE-1\n", - " ROUGE-2\n", - " ROUGE-L\n", + " BLEU\n", + " SacreBLEU Verbose\n", " Suggested length\n", " \n", " \n", " \n", " \n", + " mesolitica/finetune-noisy-translation-t5-tiny-bahasa-cased-v2\n", + " 139\n", + " 60.000967\n", + " 77.9/63.9/54.6/47.7 (BP = 1.000 ratio = 1.036 ...\n", + " 256\n", + " \n", + " \n", " mesolitica/finetune-noisy-translation-t5-small-bahasa-cased-v4\n", - " 242.0\n", - " 0.757218\n", - " 0.496729\n", - " 0.304022\n", - " 256.0\n", + " 242\n", + " 64.062582\n", + " 80.1/67.7/59.1/52.5 (BP = 1.000 ratio = 1.042 ...\n", + " 256\n", " \n", " \n", " mesolitica/finetune-noisy-translation-t5-base-bahasa-cased-v2\n", - " 892.0\n", - " 0.713227\n", - " 0.470135\n", - " 0.366797\n", - " 256.0\n", + " 892\n", + " 64.583819\n", + " 80.2/68.1/59.8/53.2 (BP = 1.000 ratio = 1.048 ...\n", + " 256\n", " \n", " \n", "\n", "" ], "text/plain": [ - " Size (MB) ROUGE-1 \\\n", - "mesolitica/finetune-noisy-translation-t5-small-... 242.0 0.757218 \n", - "mesolitica/finetune-noisy-translation-t5-base-b... 892.0 0.713227 \n", + " Size (MB) BLEU \\\n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... 139 60.000967 \n", + "mesolitica/finetune-noisy-translation-t5-small-... 242 64.062582 \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 
892 64.583819 \n", "\n", - " ROUGE-2 ROUGE-L \\\n", - "mesolitica/finetune-noisy-translation-t5-small-... 0.496729 0.304022 \n", - "mesolitica/finetune-noisy-translation-t5-base-b... 0.470135 0.366797 \n", + " SacreBLEU Verbose \\\n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... 77.9/63.9/54.6/47.7 (BP = 1.000 ratio = 1.036 ... \n", + "mesolitica/finetune-noisy-translation-t5-small-... 80.1/67.7/59.1/52.5 (BP = 1.000 ratio = 1.042 ... \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 80.2/68.1/59.8/53.2 (BP = 1.000 ratio = 1.048 ... \n", "\n", - " Suggested length \n", - "mesolitica/finetune-noisy-translation-t5-small-... 256.0 \n", - "mesolitica/finetune-noisy-translation-t5-base-b... 256.0 " + " Suggested length \n", + "mesolitica/finetune-noisy-translation-t5-tiny-b... 256 \n", + "mesolitica/finetune-noisy-translation-t5-small-... 256 \n", + "mesolitica/finetune-noisy-translation-t5-base-b... 256 " ] }, "execution_count": 3, @@ -151,7 +158,7 @@ "\n", "```python\n", "def huggingface(\n", - " model: str = 'mesolitica/finetune-normalizer-t5-small-standard-bahasa-cased',\n", + " model: str = 'mesolitica/finetune-noisy-translation-t5-small-bahasa-cased-v4',\n", " force_check: bool = True,\n", " use_rules_normalizer: bool = True,\n", " kenlm_model: str = 'bahasa-wiki-news',\n", @@ -170,7 +177,7 @@ "\n", " Parameters\n", " ----------\n", - " model: str, optional (default='mesolitica/finetune-normalizer-t5-small-standard-bahasa-cased')\n", + " model: str, optional (default='mesolitica/finetune-noisy-translation-t5-small-bahasa-cased-v4')\n", " Check available models at `malaya.normalizer.abstractive.available_huggingface()`.\n", " force_check: bool, optional (default=True)\n", " Force check model one of malaya model.\n", @@ -179,10 +186,12 @@ " kenlm_model: str, optional (default='bahasa-wiki-news')\n", " the model trained on `malaya.language_model.kenlm(model = 'bahasa-wiki-news')`,\n", " but you can use any kenlm model from 
`malaya.language_model.available_kenlm`.\n", + " Also you can pass as None to skip spelling correction but still apply rules normalizer.\n", " This parameter will be ignored if `use_rules_normalizer=False`.\n", " stem_model: str, optional (default='noisy')\n", " the model trained on `malaya.stem.deep_model(model = 'noisy'),\n", " but you can use any stemmer model from `malaya.stem.available_model`.\n", + " Also you can pass as None to skip stemming but still apply rules normalizer.\n", " This parameter will be ignored if `use_rules_normalizer=False`.\n", " segmenter: Callable, optional (default=None)\n", " segmenter function to segment text, read more at https://malaya.readthedocs.io/en/stable/load-normalizer.html#Use-segmenter\n", @@ -210,6 +219,31 @@ "```" ] }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-12-19 15:05:28.466979: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", + "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2022-12-19 15:05:28.472055: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected\n", + "2022-12-19 15:05:28.472078: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: husein-MS-7D31\n", + "2022-12-19 15:05:28.472081: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: husein-MS-7D31\n", + "2022-12-19 15:05:28.472163: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: Not found: was unable to find libcuda.so DSO loaded into this program\n", + "2022-12-19 15:05:28.472205: I 
tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 470.161.3\n" + ] + } + ], + "source": [ + "model_default = malaya.normalizer.abstractive.huggingface(model = 'mesolitica/finetune-noisy-translation-t5-small-bahasa-cased-v4',)" + ] + }, { "cell_type": "code", "execution_count": 5, @@ -243,22 +277,69 @@ "output_type": "execute_result" } ], + "source": [ + "model_default.generate(['ak tk suka hg'], max_length = 256)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n" + ] + }, + { + "data": { + "text/plain": [ + "['Saya tidak suka awak']" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "model.generate(['ak tk suka hg'], max_length = 256)" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['Dia berada di lorong dalam tetapi dia mahu masuk ke kanan']" + "['Dia berada di lorong dalam tetapi mahu masuk ke kanan']" ] }, - "execution_count": 13, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model_default.generate(['Dia kat lane dalam tapi nk masuk kanan'], max_length = 256)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Dia berada di lorong dalam tetapi mahu masuk ke kanan']" + ] + }, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -269,7 +350,27 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + 
"['@mesyceres Haha terkejut nak keluarkan semua. Idul Fitri lagi.']" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model_default.generate(['@mesyceres Haha ayookk keluarkan semuanya. Bentar lagi Idul Fitri .'], max_length = 256)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, "metadata": { "scrolled": true }, @@ -280,7 +381,7 @@ "['@mesyceres Haha jom keluarkan semuanya. Idul Fitri lagi.']" ] }, - "execution_count": 14, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -291,16 +392,37 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['Hai kawan-kawan! Saya perasan semalam & hari ini ramai yang dapat biskut ni kan? Jadi hari ini saya ingin berkongsi beberapa post mortem batch pertama kami:']" + "['Hai semua! Saya perasan semalam & hari ni ramai yang dapat biskut ni kan? Jadi hari ini saya ingin berkongsi beberapa post mortem batch pertama kami:']" ] }, - "execution_count": 15, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model_default.generate(['Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch:'],\n", + " max_length = 256)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Hai semua! Saya perasan semalam & hari ni ramai yang dapat biskut ni kan? 
Jadi hari ini saya ingin berkongsi beberapa post mortem batch pertama kami:']" + ] + }, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -312,25 +434,66 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['Tak guna budak zaman sekarang ni, nak gosok baju pun kena belajar tiktok.']" + "['Tak guna budak zaman sekarang ni, suruh gosok baju pun kena belajar kat toktok.']" ] }, - "execution_count": 16, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s = 'Punyalah tak guna bebudak zaman sekarang ni sampaikan gosok baju pun kena belajar kat tiktok.'\n", + "model_default.generate([s], max_length = 256)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Tak guna budak zaman sekarang ni, kalau gosok baju pun kena belajar kat tiktok.']" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ "model.generate([s], max_length = 256)" ] }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Sepi juga bila adik beradik dah kahwin / kahwin']" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "s = 'Lonely jugak ye when your siblings are married/ getting married'\n", + "model_default.generate([s], max_length = 256)" + ] + }, { "cell_type": "code", "execution_count": 17, @@ -339,7 +502,7 @@ { "data": { "text/plain": [ - "['Sepi juga apabila adik-beradik anda sudah berkahwin/berkahwin']" + "['Sepi juga bila adik beradik dah kahwin/kahwin']" ] }, "execution_count": 17, @@ -348,7 +511,6 @@ } ], "source": [ - "s = 'Lonely jugak ye when your siblings are married/ getting married'\n", "model.generate([s], max_length = 256)" ] }, @@ -360,7 +522,7 @@ { "data": { 
"text/plain": [ - "['Rasa janggal bila tengok kerajaan cepat buat kerja masa bencana. Dengan mesin yang ada, ia sebenarnya boleh. Ini adalah permulaan yang baik.']" + "['Rasa janggal bila tengok kerajaan cepat buat kerja bencana. Dengan mesin yang ada, ia sebenarnya boleh. Ini adalah permulaan yang baik.']" ] }, "execution_count": 18, @@ -370,6 +532,26 @@ ], "source": [ "s = 'Rasa awkward bila tengok kerajaan laju buat kerja time bencana. With the machineries yang ada, sebenarnya boleh je. This is a good start.'\n", + "model_default.generate([s], max_length = 256)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Rasa janggal bila tengok kerajaan cepat buat kerja bencana. Dengan mesin yang ada, ia sebenarnya boleh. Ini adalah permulaan yang baik.']" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ "model.generate([s], max_length = 256)" ] }, @@ -381,7 +563,7 @@ { "data": { "text/plain": [ - "['kalau boleh elakkan ada perkataan macam ni dalam resume. Bukan sebab apa, sebelum kita pergi ke fasa temuduga, dekat resume kita sendiri, kita di sini untuk memberi kesan yang baik untuk menunjukkan siapa kita sebenarnya']" + "['kalau boleh elakkan ada perkataan macam ni dalam resume. Bukan sebab apa, sebelum kita pergi fasa temuduga, dekat resume sendiri kita kena bagi kesan yang baik untuk menunjukkan siapa kita sebenarnya.']" ] }, "execution_count": 20, @@ -391,7 +573,7 @@ ], "source": [ "s = 'kalau boleh elakkan ada perkataan macam ni dalam resume. Bukan sebab apa, sebelum kita ke fasa interview, dekat resume tu sendiri kita dah kene bagi good impression menunjukkan siapa diri kita yang sebenarnya'\n", - "model.generate([s], max_length = 256)" + "model_default.generate([s], max_length = 256)" ] }, { @@ -402,7 +584,7 @@ { "data": { "text/plain": [ - "['Sudah, bro.. 
berehat dari politik supaya orang tidak terus dibenci dengan kenyataan yang pelik dan mengelirukan...']" + "['kalau boleh elakkan ada perkataan macam ni dalam resume. Bukan sebab apa, sebelum kita pergi fasa temuduga, dekat resume sendiri kita dah dapat kesan yang baik menunjukkan siapa diri kita yang sebenar']" ] }, "execution_count": 21, @@ -410,8 +592,48 @@ "output_type": "execute_result" } ], + "source": [ + "model.generate([s], max_length = 256)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Sudah cukup, puan... berehatlah dari politik agar tidak terus dibenci orang dengan kenyataan pelik dan mengelirukan...']" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "s = 'Udah2 lah ayoh cik...berehatlah dari politik agar tidak berterusan dibenci orang dgn kenyataan yg pelik dan mengelirukan...'\n", + "model_default.generate([s], max_length = 256)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Itu sahaja, puan... berehatlah dari politik agar tidak terus dibenci orang dengan kenyataan pelik dan mengelirukan...']" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ "model.generate([s], max_length = 256)" ] }, @@ -432,9 +654,9 @@ { "data": { "text/plain": [ - "['Sudah ayoh cik... rehat dari politik supaya tidak terus dibenci orang dengan kenyataan yang pelik dan mengelirukan...',\n", - " 'Sudah-Sudahlah bro.. berehat dari politik supaya orang tidak terus dibenci dengan kenyataan yang pelik dan mengelirukan...',\n", - " 'Itu sahaja.. berehat dari politik supaya tidak terus dibenci orang dengan kenyataan yang pelik dan mengelirukan...']" + "['Ayuh cik... rehat dari politik supaya tidak terus dibenci orang dengan kenyataan yang pelik dan mengelirukan...',\n", + " 'Itu sahaja, puan... 
rehatkan politik supaya tidak terus dibenci orang dengan kenyataan pelik dan mengelirukan...',\n", + " 'Dah tu pakcik... berehat dari politik supaya orang tidak terus membencinya dengan kenyataan pelik dan mengelirukan...']" ] }, "execution_count": 24, @@ -450,18 +672,18 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['Sudah, bro.. berehat dari politik supaya tidak terus dibenci orang dengan kenyataan yang pelik dan mengelirukan...',\n", - " 'Itu sahaja bro.. rehat daripada politik supaya tidak dibenci orang dengan kenyataan yang pelik dan mengelirukan...',\n", - " 'Itu sahaja bro... berehatlah dari politik supaya orang tidak terus dibenci dengan kenyataan yang pelik dan mengelirukan...']" + "['Itu sahaja, puan... rehat politik supaya tidak terus dibenci oleh orang dengan kenyataan pelik dan mengelirukan...',\n", + " 'Sudah, ayuh cik.. berehat dari politik supaya tidak terus dibenci orang dengan kenyataan pelik dan mengelirukan...',\n", + " 'Itu sahaja, makcik... 
rehat dari politik supaya tidak terus dibenci orang dengan kenyataan yang pelik dan mengelirukan...']" ] }, - "execution_count": 27, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -470,6 +692,13 @@ "model.generate([s], max_length = 256, do_sample=True, penalty_alpha=0.6, top_k=4, temperature = 0.7,\n", " num_return_sequences=3)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/malaya/augmentation/abstractive.py b/malaya/augmentation/abstractive.py index 434cf000..7ad6cf56 100644 --- a/malaya/augmentation/abstractive.py +++ b/malaya/augmentation/abstractive.py @@ -8,20 +8,20 @@ _huggingface_availability = { 'mesolitica/finetune-noisy-translation-t5-tiny-bahasa-cased-v2': { 'Size (MB)': 139, - 'BLEU': 41.625536185056305, - 'SacreBLEU Verbose': '73.4/50.1/35.7/25.7 (BP = 0.971 ratio = 0.972 hyp_len = 21400 ref_len = 22027)', + 'BLEU': 60.0009672168891, + 'SacreBLEU Verbose': '77.9/63.9/54.6/47.7 (BP = 1.000 ratio = 1.036 hyp_len = 110970 ref_len = 107150)', 'Suggested length': 256, }, 'mesolitica/finetune-noisy-translation-t5-small-bahasa-cased-v4': { 'Size (MB)': 242, - 'BLEU': 41.625536185056305, - 'SacreBLEU Verbose': '73.4/50.1/35.7/25.7 (BP = 0.971 ratio = 0.972 hyp_len = 21400 ref_len = 22027)', + 'BLEU': 64.06258219941243, + 'SacreBLEU Verbose': '80.1/67.7/59.1/52.5 (BP = 1.000 ratio = 1.042 hyp_len = 111635 ref_len = 107150)', 'Suggested length': 256, }, 'mesolitica/finetune-noisy-translation-t5-base-bahasa-cased-v2': { 'Size (MB)': 892, - 'BLEU': 41.625536185056305, - 'SacreBLEU Verbose': '73.4/50.1/35.7/25.7 (BP = 0.971 ratio = 0.972 hyp_len = 21400 ref_len = 22027)', + 'BLEU': 64.583819005204, + 'SacreBLEU Verbose': '80.2/68.1/59.8/53.2 (BP = 1.000 ratio = 1.048 hyp_len = 112260 ref_len = 107150)', 'Suggested length': 256, }, } diff --git a/malaya/dependency.py b/malaya/dependency.py index 0e5ef688..8bc80e4a 100644 --- 
a/malaya/dependency.py +++ b/malaya/dependency.py @@ -13,6 +13,7 @@ from malaya.function import describe_availability from herpetologist import check_type import logging +import warnings logger = logging.getLogger(__name__) @@ -243,6 +244,8 @@ def available_transformer(version: str = 'v2'): * ``'v2'`` - Trained on bigger dataset, better version. """ + warnings.warn( + '`malaya.dependency.available_transformer` is deprecated, use `malaya.dependency.available_huggingface` instead', DeprecationWarning) _describe() return describe_availability(_transformer_availability[_validate_version(version)]) @@ -283,7 +286,8 @@ def transformer(version: str = 'v2', model: str = 'xlnet', quantized: bool = Fal * if `bert` in model, will return `malaya.model.bert.DependencyBERT`. * if `xlnet` in model, will return `malaya.model.xlnet.DependencyXLNET`. """ - + warnings.warn( + '`malaya.dependency.transformer` is deprecated, use `malaya.dependency.huggingface` instead', DeprecationWarning) logger.warning( '`malaya.dependency.transformer` trained on indonesian dataset and augmented dataset, not an actual malay dataset.') diff --git a/malaya/normalizer/abstractive.py b/malaya/normalizer/abstractive.py index 1db5187c..a3ba0699 100644 --- a/malaya/normalizer/abstractive.py +++ b/malaya/normalizer/abstractive.py @@ -8,20 +8,20 @@ _huggingface_availability = { 'mesolitica/finetune-noisy-translation-t5-tiny-bahasa-cased-v2': { 'Size (MB)': 139, - 'BLEU': 41.625536185056305, - 'SacreBLEU Verbose': '73.4/50.1/35.7/25.7 (BP = 0.971 ratio = 0.972 hyp_len = 21400 ref_len = 22027)', + 'BLEU': 60.0009672168891, + 'SacreBLEU Verbose': '77.9/63.9/54.6/47.7 (BP = 1.000 ratio = 1.036 hyp_len = 110970 ref_len = 107150)', 'Suggested length': 256, }, 'mesolitica/finetune-noisy-translation-t5-small-bahasa-cased-v4': { 'Size (MB)': 242, - 'BLEU': 41.625536185056305, - 'SacreBLEU Verbose': '73.4/50.1/35.7/25.7 (BP = 0.971 ratio = 0.972 hyp_len = 21400 ref_len = 22027)', + 'BLEU': 64.06258219941243, + 
'SacreBLEU Verbose': '80.1/67.7/59.1/52.5 (BP = 1.000 ratio = 1.042 hyp_len = 111635 ref_len = 107150)', 'Suggested length': 256, }, 'mesolitica/finetune-noisy-translation-t5-base-bahasa-cased-v2': { 'Size (MB)': 892, - 'BLEU': 41.625536185056305, - 'SacreBLEU Verbose': '73.4/50.1/35.7/25.7 (BP = 0.971 ratio = 0.972 hyp_len = 21400 ref_len = 22027)', + 'BLEU': 64.583819005204, + 'SacreBLEU Verbose': '80.2/68.1/59.8/53.2 (BP = 1.000 ratio = 1.048 hyp_len = 112260 ref_len = 107150)', 'Suggested length': 256, }, } @@ -65,10 +65,12 @@ def huggingface( kenlm_model: str, optional (default='bahasa-wiki-news') the model trained on `malaya.language_model.kenlm(model = 'bahasa-wiki-news')`, but you can use any kenlm model from `malaya.language_model.available_kenlm`. + Also you can pass as None to skip spelling correction but still apply rules normalizer. This parameter will be ignored if `use_rules_normalizer=False`. stem_model: str, optional (default='noisy') the model trained on `malaya.stem.deep_model(model = 'noisy'), but you can use any stemmer model from `malaya.stem.available_model`. + Also you can pass as None to skip stemming but still apply rules normalizer. This parameter will be ignored if `use_rules_normalizer=False`. 
segmenter: Callable, optional (default=None) segmenter function to segment text, read more at https://malaya.readthedocs.io/en/stable/load-normalizer.html#Use-segmenter @@ -97,10 +99,17 @@ def huggingface( from malaya.language_model import kenlm from malaya.stem import deep_model - from malaya.spelling_correction.probability import load_spelling + from malaya.spelling_correction.probability import load as load_spelling + from malaya.normalizer.rules import load as load_normalizer - lm = kenlm(model=kenlm_model) - stemmer = deep_model(model=stem_model) + if isinstance(kenlm_model, str): + lm = kenlm(model=kenlm_model) + else: + lm = None + if isinstance(stem_model, str): + stemmer = deep_model(model=stem_model) + else: + stemmer = None corrector = load_spelling( language_model=lm, replace_augmentation=replace_augmentation, @@ -108,13 +117,14 @@ def huggingface( maxlen=maxlen_speller, minlen=minlen_speller, ) + normalizer = load_normalizer(corrector, stemmer, date=False) else: - corrector = None + normalizer = None return load_huggingface.load_normalizer( model=model, initial_text='terjemah pasar Melayu ke Melayu: ', - corrector=corrector, + normalizer=normalizer, segmenter=segmenter, text_scorer=text_scorer, **kwargs, diff --git a/malaya/normalizer/rules.py b/malaya/normalizer/rules.py index 1b0d66a7..411bc494 100644 --- a/malaya/normalizer/rules.py +++ b/malaya/normalizer/rules.py @@ -236,8 +236,8 @@ def normalize( translator: Callable = None, language_detection_word: Callable = None, acceptable_language_detection: List[str] = ['EN', 'CAPITAL', 'NOT_LANG'], - segmenter: Callable = None, - text_scorer: Callable = None, + segmenter=None, + text_scorer=None, text_scorer_window: int = 2, not_a_word_threshold: float = 1e-4, dateparser_settings={'TIMEZONE': 'GMT+8'}, diff --git a/malaya/supervised/huggingface.py b/malaya/supervised/huggingface.py index e87bd89f..2b1fa36d 100644 --- a/malaya/supervised/huggingface.py +++ b/malaya/supervised/huggingface.py @@ -81,7 +81,7 
@@ def load_tatabahasa(model, initial_text, **kwargs): def load_normalizer( model, initial_text, - corrector, + normalizer, segmenter=None, text_scorer=None, **kwargs, @@ -89,7 +89,7 @@ def load_normalizer( return Normalizer( model, initial_text, - corrector, + normalizer, segmenter=segmenter, text_scorer=text_scorer, **kwargs, diff --git a/malaya/torch_model/huggingface.py b/malaya/torch_model/huggingface.py index 84c552d4..9dcfd2c7 100644 --- a/malaya/torch_model/huggingface.py +++ b/malaya/torch_model/huggingface.py @@ -794,7 +794,7 @@ def __init__( self, model, initial_text, - corrector, + normalizer, segmenter=None, text_scorer=None, **kwargs, @@ -805,7 +805,7 @@ def __init__( initial_text=initial_text, **kwargs, ) - self.corrector = corrector + self.normalizer = normalizer self.segmenter = segmenter self.text_scorer = text_scorer @@ -830,8 +830,24 @@ def generate( ------- result: List[str] """ - if self.corrector is not None: - pass + if self.normalizer is not None: + for i in range(len(strings)): + t = strings[i] + try: + normalized = self.normalizer.normalize( + t, normalize_hingga=False, normalize_cardinal=False, + normalize_ordinal=False, normalize_pada_hari_bulan=False, + normalize_fraction=False, normalize_money=False, normalize_date=False, + normalize_time=False, normalize_ic=False, normalize_units=False, + normalize_url=False, normalize_percent=False, normalize_telephone=False, + text_scorer=self.text_scorer, segmenter=self.segmenter, + not_a_word_threshold=1e-9, + )['normalize'] + logger.debug(f'input: {t}, normalized: {normalized}') + strings[i] = normalized + except Exception as e: + logger.warning(f'input: {t}, exception {e}') + logger.warning(f'input: {t}, `self.normalizer` exception, skip to normalize.') return super().generate(strings, **kwargs) diff --git a/malaya/translation/en_ms.py b/malaya/translation/en_ms.py index bc881ae8..262b632b 100644 --- a/malaya/translation/en_ms.py +++ b/malaya/translation/en_ms.py @@ -145,22 +145,22 @@ }, 
'mesolitica/finetune-noisy-translation-t5-tiny-bahasa-cased-v2': { 'Size (MB)': 139, - 'BLEU': 41.625536185056305, - 'SacreBLEU Verbose': '73.4/50.1/35.7/25.7 (BP = 0.971 ratio = 0.972 hyp_len = 21400 ref_len = 22027)', + 'BLEU': 60.0009672168891, + 'SacreBLEU Verbose': '77.9/63.9/54.6/47.7 (BP = 1.000 ratio = 1.036 hyp_len = 110970 ref_len = 107150)', 'SacreBLEU-chrF++-FLORES200': None, 'Suggested length': 256, }, 'mesolitica/finetune-noisy-translation-t5-small-bahasa-cased-v4': { 'Size (MB)': 242, - 'BLEU': 41.625536185056305, - 'SacreBLEU Verbose': '73.4/50.1/35.7/25.7 (BP = 0.971 ratio = 0.972 hyp_len = 21400 ref_len = 22027)', + 'BLEU': 64.06258219941243, + 'SacreBLEU Verbose': '80.1/67.7/59.1/52.5 (BP = 1.000 ratio = 1.042 hyp_len = 111635 ref_len = 107150)', 'SacreBLEU-chrF++-FLORES200': None, 'Suggested length': 256, }, 'mesolitica/finetune-noisy-translation-t5-base-bahasa-cased-v2': { 'Size (MB)': 892, - 'BLEU': 41.625536185056305, - 'SacreBLEU Verbose': '73.4/50.1/35.7/25.7 (BP = 0.971 ratio = 0.972 hyp_len = 21400 ref_len = 22027)', + 'BLEU': 64.583819005204, + 'SacreBLEU Verbose': '80.2/68.1/59.8/53.2 (BP = 1.000 ratio = 1.048 hyp_len = 112260 ref_len = 107150)', 'SacreBLEU-chrF++-FLORES200': None, 'Suggested length': 256, }, @@ -169,7 +169,7 @@ def _describe(): logger.info('tested on FLORES200 EN-MS (eng_Latn-zsm_Latn) pair `dev` set, https://github.com/facebookresearch/flores/tree/main/flores200') - logger.info('for noisy, tested on noisy augmented FLORES200 EN-MS (eng_Latn-zsm_Latn) pair `dev` set, https://github.com/huseinzol05/malay-dataset/tree/master/translation/nllb-noisy-dev-augmentation') + logger.info('for noisy, tested on noisy twitter google translation, https://huggingface.co/datasets/mesolitica/augmentation-test-set') def available_transformer(): diff --git a/malaya/translation/ms_en.py b/malaya/translation/ms_en.py index bfb034ae..14eabf08 100644 --- a/malaya/translation/ms_en.py +++ b/malaya/translation/ms_en.py @@ -135,7 +135,7 @@ 
'Suggested length': 256, }, 'mesolitica/finetune-noisy-translation-t5-base-bahasa-cased': { - 'Size (MB)': 242, + 'Size (MB)': 892, 'BLEU': 43.432723192596406, 'SacreBLEU Verbose': '71.8/49.8/36.6/27.2 (BP = 1.000 ratio = 1.000 hyp_len = 92982 ref_len = 92985)', 'SacreBLEU-chrF++-FLORES200': None, @@ -143,22 +143,22 @@ }, 'mesolitica/finetune-noisy-translation-t5-tiny-bahasa-cased-v2': { 'Size (MB)': 139, - 'BLEU': 41.625536185056305, - 'SacreBLEU Verbose': '73.4/50.1/35.7/25.7 (BP = 0.971 ratio = 0.972 hyp_len = 21400 ref_len = 22027)', + 'BLEU': 60.0009672168891, + 'SacreBLEU Verbose': '77.9/63.9/54.6/47.7 (BP = 1.000 ratio = 1.036 hyp_len = 110970 ref_len = 107150)', 'SacreBLEU-chrF++-FLORES200': None, 'Suggested length': 256, }, 'mesolitica/finetune-noisy-translation-t5-small-bahasa-cased-v4': { 'Size (MB)': 242, - 'BLEU': 41.625536185056305, - 'SacreBLEU Verbose': '73.4/50.1/35.7/25.7 (BP = 0.971 ratio = 0.972 hyp_len = 21400 ref_len = 22027)', + 'BLEU': 64.06258219941243, + 'SacreBLEU Verbose': '80.1/67.7/59.1/52.5 (BP = 1.000 ratio = 1.042 hyp_len = 111635 ref_len = 107150)', 'SacreBLEU-chrF++-FLORES200': None, 'Suggested length': 256, }, 'mesolitica/finetune-noisy-translation-t5-base-bahasa-cased-v2': { 'Size (MB)': 892, - 'BLEU': 41.625536185056305, - 'SacreBLEU Verbose': '73.4/50.1/35.7/25.7 (BP = 0.971 ratio = 0.972 hyp_len = 21400 ref_len = 22027)', + 'BLEU': 64.583819005204, + 'SacreBLEU Verbose': '80.2/68.1/59.8/53.2 (BP = 1.000 ratio = 1.048 hyp_len = 112260 ref_len = 107150)', 'SacreBLEU-chrF++-FLORES200': None, 'Suggested length': 256, }, diff --git a/session/translation/noisy-hf-t5/README.md b/session/translation/noisy-hf-t5/README.md index 904a12e9..653c1c31 100644 --- a/session/translation/noisy-hf-t5/README.md +++ b/session/translation/noisy-hf-t5/README.md @@ -1,9 +1,12 @@ -Original script, https://github.com/huggingface/transformers/blob/v4.21.2/examples/pytorch/translation/run_translation.py +# HuggingFace T5 -``` -shuf train-noisy.json > 
+1. Prepare the dataset by running [prepare-data.ipynb](prepare-data.ipynb).
+
+2. Run the training script,
sorted(glob('finetune-t5-base-standard-bahasa-cased-combined/checkpoint-*'))\n", "checkpoints" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ - "from transformers import T5Tokenizer, T5ForConditionalGeneration\n", - "\n", - "tokenizer = T5Tokenizer.from_pretrained('mesolitica/t5-small-standard-bahasa-cased')\n", "model = T5ForConditionalGeneration.from_pretrained(checkpoints[-1])" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# model2 = T5ForConditionalGeneration.from_pretrained('mesolitica/finetune-translation-t5-small-standard-bahasa-cased')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " Hi guys! I noticed yesterday and today many of these cookies are available. So today I want to share some post mortem of our first batch:\n" + " Hi guys! I noticed yesterday & today many people have got these cookies, right? So today I want to share some post mortem of our first batch:\n" ] } ], "source": [ "input_ids = tokenizer.encode('terjemah Melayu ke Inggeris: Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch:', return_tensors = 'pt')\n", "outputs = model.generate(input_ids, max_length = 100)\n", - "print(tokenizer.decode(outputs[0]))" + "print(tokenizer.decode(outputs[0]))\n" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " Hai kawan! Saya perhatikan semalam & harini ramai yang dapat cookies ni kan. Jadi harini saya nak kongsi beberapa post mortem kumpulan pertama kami:\n" + " Can you solve it?\n" ] } ], "source": [ - "input_ids = tokenizer.encode('terjemah Inggeris ke Melayu: Hi guys! 
I noticed semalam & harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch:', return_tensors = 'pt')\n", + "input_ids = tokenizer.encode('terjemah Melayu ke Inggeris: mesolitica boleh buat asr tak', return_tensors = 'pt')\n", "outputs = model.generate(input_ids, max_length = 100)\n", - "print(tokenizer.decode(outputs[0]))" + "print(tokenizer.decode(outputs[0]))\n" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " I don't understand la.\n", - " At 8 a.m., the market is a lot of people, so he's a good person.\n", - " So it's fucking shit.\n", - " Where are you going?\n", - " It's like taking half a day.\n", - " Imagine PH and win pru-14. Passovers are all kinds of back doors. Last-last Ismail Sabri goes up. That's why I don't give a fuck about politics anymore. I swear I'm up.\n" + " Brother, I want to copy it on Facebook.. haha. If you can't, I'll\n" ] } ], "source": [ - "strings = [\n", - " 'ak tak paham la',\n", - " 'jam 8 di pasar KK memang org ramai 😂, pandai dia pilih tmpt.',\n", - " 'Jadi haram jadah😀😃🤭',\n", - " 'nak gi mana tuu',\n", - " 'Macam nak ambil half day',\n", - " \"Bayangkan PH dan menang pru-14. Pastu macam-macam pintu belakang ada. Last-last Ismail Sabri naik. That's why I don't give a fk about politics anymore. Sumpah dah fk up dah.\",\n", - "]\n", - "for s in strings:\n", - " input_ids = tokenizer.encode(f'terjemah Melayu ke Inggeris: {s}', return_tensors = 'pt')\n", - " outputs = model.generate(input_ids, max_length = 100, )\n", - " print(tokenizer.decode(outputs[0]))" + "input_ids = tokenizer.encode('terjemah Melayu ke Inggeris: Bang, aku nak copy masuk kat fb…hahaha. 
Kalau xleh aku ss', return_tensors = 'pt')\n", + "outputs = model.generate(input_ids, max_length = 100)\n", + "print(tokenizer.decode(outputs[0]))\n" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " u ni, cakap betul lah\n", - " pelik jugak dia buat majlis biasa2 je sebab gaya hidup dia dah mewah...................................................................................\n", - " Selepas menonton video ini: mm dapnya burger benjo extra mayo\n", - " Hai kawan! Saya perhatikan semalam & harini ramai yang dapat cookies ni kan. Jadi harini saya nak kongsi beberapa post mortem kumpulan pertama kami:\n" + " Sesungguhnya. Ini bukan pakar, saya tahu. Ia isyarat, bodoh.\n" ] } ], "source": [ - "strings = [\n", - " 'u ni, talk properly lah',\n", - " \"just attended my cousin's wedding. pelik jugak dia buat majlis biasa2 je sebab her lifestyle looks lavish. then i found out they're going on a 3 weeks honeymoon. smart decision 👍\",\n", - " 'Me after seeing this video: mm dapnya burger benjo extra mayo',\n", - " 'Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch:',\n", - "]\n", - "for s in strings:\n", - " input_ids = tokenizer.encode(f'terjemah Inggeris ke Melayu: {s}', return_tensors = 'pt')\n", - " outputs = model.generate(input_ids, max_length = 100)\n", - " print(tokenizer.decode(outputs[0]))" + "input_ids = tokenizer.encode(\"terjemah pasar Melayu ke Melayu: Memanglah. Ini tak payah expert, aku pun tau. 
It's a gesture, bodoh.\", return_tensors = 'pt')\n", + "outputs = model.generate(input_ids, max_length = 100)\n", + "print(tokenizer.decode(outputs[0]))" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[' rakyat mmg x suka ko pun ', ' org melaka pun tak suka kau', ' rakyat memang x suka kau pun ']\n" + ] + } + ], "source": [ - "model.push_to_hub('finetune-noisy-translation-t5-base-bahasa-cased', organization='mesolitica')" + "input_ids = tokenizer.encode(\"terjemah Melayu ke pasar Melayu: rakyat memang tak suka awak pun\", return_tensors = 'pt')\n", + "outputs = model.generate(input_ids, max_length = 100, do_sample=True, \n", + " top_k=50, \n", + " top_p=0.95, \n", + " temperature=0.7,\n", + " num_return_sequences=3)\n", + "print(tokenizer.batch_decode(outputs))" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": false - }, - "outputs": [], + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Perbincangan khas juga bertujuan untuk Seri Paduka mendapat pandangan Raja2 Melayu bagi membolehkan Baginda membuat keputusan yang terbaik demi kepentingan dan kesejahteraan Negara serta Rakyat',\n", + " 'Perbincangan khas juga bertujuan agar Baginda mendapat pandangan Raja2 Melayu bagi membolehkan baginda membuat keputusan yang terbaik demi kepentingan dan kesejahteraan negara dan rakyat',\n", + " 'Perbincangan khas itu juga bertujuan utk baginda mendapat pandangan Raja2 Melayu utk membolehkan baginda membuat keputusan terbaik demi kepentingan dan kesejahteraan negara dan rakyat']" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "tokenizer.push_to_hub('finetune-noisy-translation-t5-base-bahasa-cased', organization='mesolitica')" + "input_ids = 
tokenizer.encode(\"terjemah Melayu ke pasar Melayu: Perbincangan khas itu juga bertujuan bagi Seri Paduka mendapat pandangan Raja-Raja Melayu untuk membolehkan baginda membuat keputusan yang terbaik demi kepentingan dan kesejahteraan negara serta rakyat\", return_tensors = 'pt')\n", + "outputs = model.generate(input_ids, max_length = 100, do_sample=True, \n", + " top_k=100, \n", + " top_p=0.95,\n", + " temperature=0.7,\n", + " num_return_sequences=3)\n", + "tokenizer.batch_decode(outputs, skip_special_tokens=True)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['Saya menulis untuk memohon jawatan Senior Software Engineer di [Company]. Sebagai seorang software engineer yang sangat mahir dan berpengalaman dengan kepakaran dalam seni bina data besar dan pemprosesan bahasa semulajadi, saya percaya saya akan menjadi tambahan yang berharga untuk pasukan anda',\n", + " 'Saya menulis untuk memohon jawatan Senior Software Engineer di [Company]. Sebagai seorang jurutera perisian yang sangat mahir dan berpengalaman dengan kepakaran dalam seni bina data besar dan pemprosesan bahasa semulajadi, saya percaya saya akan menjadi penambahan yang berharga kepada pasukan anda',\n", + " 'Saya menulis untuk memohon jawatan Senior Software Engineer di [Company]. Sebagai seorang software engineer yang sangat mahir dan berpengalaman dengan pengkhususan dalam seni bina data besar dan pemprosesan bahasa semulajadi, saya percaya saya akan menjadi penambahan berharga kepada pasukan anda']" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "!cp -r finetune-t5-base-noisy-bahasa-cased/runs finetune-noisy-translation-t5-base-bahasa-cased\n", - "!cd finetune-noisy-translation-t5-base-bahasa-cased && git add . 
&& git commit -m 'add tensorboard' && git push" + "input_ids = tokenizer.encode('terjemah Inggeris ke pasar Melayu: I am writing to apply for the Senior Software Engineer position at [Company]. As a highly skilled and experienced software engineer with a specialization in big data architecture and natural language processing, I believe I would be a valuable addition to your team', return_tensors = 'pt')\n", + "outputs = model.generate(input_ids, max_length = 100, do_sample=True, \n", + " top_k=100, \n", + " top_p=0.95, temperature=0.7,\n", + " num_return_sequences=3)\n", + "tokenizer.batch_decode(outputs, skip_special_tokens=True)" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 13, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['nous', 'can', 'do', 'asr', 'not?']" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "from sacrebleu.metrics import BLEU, CHRF, TER\n", - "\n", - "bleu = BLEU()\n", - "chrf = CHRF(word_order = 2)" + "t = 'nous bleh buat asr tak?'\n", + "input_ids = [{'input_ids': tokenizer.encode(f'terjemah Melayu ke Inggeris: {s}', return_tensors='pt')[\n", + " 0]} for s in t.split()]\n", + "padded = tokenizer.pad(input_ids, padding='longest')\n", + "outputs = model.generate(**padded, max_length = 50)\n", + "tokenizer.batch_decode(outputs, skip_special_tokens=True)" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " asr\n" + ] + } + ], + "source": [ + "input_ids = tokenizer.encode('terjemah Melayu ke Inggeris: asr', return_tensors = 'pt')\n", + "outputs = model.generate(input_ids, max_length = 100)\n", + "print(tokenizer.decode(outputs[0]))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Is there 
a promo?\n" + ] + } + ], + "source": [ + "input_ids = tokenizer.encode('terjemah Melayu ke Inggeris: ada promo x?', return_tensors = 'pt')\n", + "outputs = model.generate(input_ids, max_length = 100)\n", + "print(tokenizer.decode(outputs[0]))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " mesolitica can b\n" + ] + } + ], + "source": [ + "input_ids = tokenizer.encode('terjemah Melayu ke Inggeris: mesolitica bleh b', return_tensors = 'pt')\n", + "outputs = model.generate(input_ids, max_length = 100)\n", + "print(tokenizer.decode(outputs[0]))" + ] + }, + { + "cell_type": "code", + "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "6854" + "['can', 'do', 'asr', 'not']" ] }, - "execution_count": 9, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "from unidecode import unidecode\n", - "import json\n", - "\n", - "with open('test-noisy-shuffled.json') as fopen:\n", - " test = fopen.read().split('\\n')\n", - " test = [json.loads(t) for t in test if len(t)]\n", - " \n", - "len(test)" + "t = 'bleh buat asr tak'\n", + "input_ids = [{'input_ids': tokenizer.encode(f'terjemah Melayu ke Inggeris: {s}', return_tensors='pt')[\n", + " 0]} for s in t.split()]\n", + "padded = tokenizer.pad(input_ids, padding='longest')\n", + "outputs = model.generate(**padded, max_length = 50)\n", + "tokenizer.batch_decode(outputs, skip_special_tokens=True)\n" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 18, "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "100%|█████████████████████████████████████| 6854/6854 [1:10:04<00:00, 1.63it/s]\n" + " can you make asr or not?\n" ] } ], "source": [ - "from tqdm import tqdm\n", - "\n", - "batch_size = 1\n", - "\n", - "results_en_ms, filtered_right_en_ms = [], []\n", - "results_ms_en, 
filtered_right_ms_en = [], []\n", - "for i in tqdm(range(len(test))):\n", - " t = test[i]['translation']\n", - " p = t['prefix']\n", - " s = t['src']\n", - " tgt = t['tgt']\n", - " \n", - " input_ids = [{'input_ids': tokenizer.encode(f'{p}{s}', return_tensors = 'pt')[0]}]\n", - " padded = tokenizer.pad(input_ids, padding = 'longest')\n", - " outputs = model.generate(**padded, max_length = 1000)[0]\n", - " o = tokenizer.decode(outputs, skip_special_tokens=True)\n", - " if len(o):\n", - " if 'Inggeris ke Melayu' in p:\n", - " results_en_ms.append(o)\n", - " filtered_right_en_ms.append(tgt)\n", - " else:\n", - " results_ms_en.append(o)\n", - " filtered_right_ms_en.append(tgt)" + "input_ids = tokenizer.encode(f'terjemah Melayu ke Inggeris: {t}', return_tensors = 'pt')\n", + "outputs = model.generate(input_ids, max_length = 100)\n", + "print(tokenizer.decode(outputs[0]))" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(2937, 3917)" + "[\"I don't understand\",\n", + " 'Hi guys! I noticed yesterday & today many people have got these cookies, right? So today I want to share some post mortem of our first batch:',\n", + " \"Indeed. This doesn't bother experts, I know too. It's a gesture, stupid.\",\n", + " \"at 8 o'clock at the KK market it's really crowded, he's good at choosing a place.\",\n", + " 'So haram jadah',\n", + " 'where do you want to go?',\n", + " \"It's like taking half a day\",\n", + " \"Imagine PAKATAN HARAPAN and win pru-14. After that, there are all kinds of back doors. Ismail Sabri went up last. That's why I don't give a fk about politics anymore. I swear it's already up.\"]" ] }, - "execution_count": 11, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "len(results_en_ms), len(results_ms_en)" + "strings = [\n", + " 'ak tak paham la',\n", + " 'Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. 
So harini i nak share some post mortem of our first batch:',\n", + " \"Memanglah. Ini tak payah expert, aku pun tau. It's a gesture, bodoh.\",\n", + " 'jam 8 di pasar KK memang org ramai 😂, pandai dia pilih tmpt.',\n", + " 'Jadi haram jadah😀😃🤭',\n", + " 'nak gi mana tuu',\n", + " 'Macam nak ambil half day',\n", + " \"Bayangkan PH dan menang pru-14. Pastu macam-macam pintu belakang ada. Last-last Ismail Sabri naik. That's why I don't give a fk about politics anymore. Sumpah dah fk up dah.\",\n", + "]\n", + "input_ids = [{'input_ids': tokenizer.encode(f'terjemah Melayu ke Inggeris: {s}', return_tensors='pt')[\n", + " 0]} for s in strings]\n", + "padded = tokenizer.pad(input_ids, padding='longest')\n", + "outputs = model.generate(**padded, max_length = 100)\n", + "tokenizer.batch_decode(outputs, skip_special_tokens=True)" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "({'name': 'BLEU',\n", - " 'score': 42.16321973536871,\n", - " '_mean': -1.0,\n", - " '_ci': -1.0,\n", - " '_verbose': '73.4/50.1/35.7/25.8 (BP = 0.982 ratio = 0.982 hyp_len = 63335 ref_len = 64473)',\n", - " 'bp': 0.9821925128801015,\n", - " 'counts': [46490, 30266, 20534, 14086],\n", - " 'totals': [63335, 60398, 57461, 54524],\n", - " 'sys_len': 63335,\n", - " 'ref_len': 64473,\n", - " 'precisions': [73.40333149127655,\n", - " 50.11093082552402,\n", - " 35.7355423678669,\n", - " 25.834494901327854],\n", - " 'prec_str': '73.4/50.1/35.7/25.8',\n", - " 'ratio': 0.9823492004404945},\n", - " chrF2++ = 66.51)" + "['Ini awak, cakap betul-betul',\n", + " 'baru menghadiri majlis perkahwinan sepupu saya. Peliknya dia hanya mengadakan majlis biasa kerana gaya hidupnya kelihatan mewah. kemudian saya mendapat tahu mereka akan pergi pada bulan madu selama 3 minggu. keputusan yang bijak',\n", + " 'Saya selepas melihat video ini: burger benjo extra mayo memang sedap',\n", + " 'Hai kawan-kawan! 
Saya perasan semalam & hari ini ramai yang dapat biskut ni kan? Jadi hari ini saya ingin berkongsi beberapa post mortem batch pertama kami:']" ] }, - "execution_count": 12, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "refs = [filtered_right_en_ms]\n", - "sys = results_en_ms\n", - "r = bleu.corpus_score(sys, refs)\n", - "r.__dict__, chrf.corpus_score(sys, refs)" + "strings = [\n", + " 'u ni, talk properly lah',\n", + " \"just attended my cousin's wedding. pelik jugak dia buat majlis biasa2 je sebab her lifestyle looks lavish. then i found out they're going on a 3 weeks honeymoon. smart decision 👍\",\n", + " 'Me after seeing this video: mm dapnya burger benjo extra mayo',\n", + " 'Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch:',\n", + "]\n", + "input_ids = [{'input_ids': tokenizer.encode(f'terjemah pasar Melayu ke Melayu: {s}', return_tensors='pt')[\n", + " 0]} for s in strings]\n", + "padded = tokenizer.pad(input_ids, padding='longest')\n", + "outputs = model.generate(**padded, max_length = 100)\n", + "tokenizer.batch_decode(outputs, skip_special_tokens=True)" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 21, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/husein/.local/lib/python3.8/site-packages/transformers/utils/hub.py:651: UserWarning: The `organization` argument is deprecated and will be removed in v5 of Transformers. 
Set your organization directly in the `repo_id` passed instead (`repo_id={organization}/{model_id}`).\n", + " warnings.warn(\n" + ] + }, { "data": { "text/plain": [ - "({'name': 'BLEU',\n", - " 'score': 43.432723192596406,\n", - " '_mean': -1.0,\n", - " '_ci': -1.0,\n", - " '_verbose': '71.8/49.8/36.6/27.2 (BP = 1.000 ratio = 1.000 hyp_len = 92982 ref_len = 92985)',\n", - " 'bp': 0.999967736211266,\n", - " 'counts': [66716, 44323, 31152, 22130],\n", - " 'totals': [92982, 89065, 85148, 81231],\n", - " 'sys_len': 92982,\n", - " 'ref_len': 92985,\n", - " 'precisions': [71.75152179991827,\n", - " 49.76477853253242,\n", - " 36.58570958801146,\n", - " 27.243293816400143],\n", - " 'prec_str': '71.8/49.8/36.6/27.2',\n", - " 'ratio': 0.9999677367317309},\n", - " chrF2++ = 65.52)" + "CommitInfo(commit_url='https://huggingface.co/mesolitica/finetune-noisy-translation-t5-base-bahasa-cased-v2/commit/569bdfc042a26d6b4f35b7c0ce6cb977d5a799ac', commit_message='Upload T5ForConditionalGeneration', commit_description='', oid='569bdfc042a26d6b4f35b7c0ce6cb977d5a799ac', pr_url=None, pr_revision=None, pr_num=None)" ] }, - "execution_count": 13, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.push_to_hub('finetune-noisy-translation-t5-base-bahasa-cased-v2', organization='mesolitica')" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "CommitInfo(commit_url='https://huggingface.co/mesolitica/finetune-noisy-translation-t5-base-bahasa-cased-v2/commit/565abb8debe26ead2a002c5ad8cf37c015f94f42', commit_message='Upload tokenizer', commit_description='', oid='565abb8debe26ead2a002c5ad8cf37c015f94f42', pr_url=None, pr_revision=None, pr_num=None)" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.push_to_hub('finetune-noisy-translation-t5-base-bahasa-cased-v2', organization='mesolitica')" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "_ = model.cuda()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "5000it [12:17, 6.78it/s]\n" + ] + } + ], + "source": [ + "from tqdm import tqdm\n", + "import json\n", + "\n", + "filtered_left, filtered_right = [], []\n", + "\n", + "with open('shuffled-test.json') as fopen:\n", + " for l in tqdm(fopen):\n", + " data = json.loads(l)['translation']\n", + " p = data['prefix']\n", + " src = data['src']\n", + " input_ids = [{'input_ids': tokenizer.encode(f'{p}: {src}', return_tensors = 'pt')[0]}]\n", + " padded = tokenizer.pad(input_ids, padding = 'longest')\n", + " for k in padded.keys():\n", + " padded[k] = padded[k].cuda()\n", + " outputs = model.generate(**padded, max_length = 256)\n", + " filtered_left.append(tokenizer.decode(outputs[0], skip_special_tokens=True))\n", + " filtered_right.append(data['tgt'])" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "refs = [filtered_right]\n", + "sys = filtered_left" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "from sacrebleu.metrics import BLEU, CHRF, TER\n", + "\n", + "bleu = BLEU()\n", + "chrf = CHRF(word_order = 2)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'name': 'BLEU',\n", + " 'score': 64.583819005204,\n", + " '_mean': -1.0,\n", + " '_ci': -1.0,\n", + " '_verbose': '80.2/68.1/59.8/53.2 (BP = 1.000 ratio = 1.048 hyp_len = 112260 ref_len = 107150)',\n", + " 'bp': 1.0,\n", + " 'counts': [90014, 73084, 61157, 51798],\n", + " 'totals': [112260, 107260, 102260, 97281],\n", + " 'sys_len': 112260,\n", + " 'ref_len': 107150,\n", + " 'precisions': [80.1835025832888,\n", + " 
68.13723662129405,\n", + " 59.805398005085074,\n", + " 53.2457519967928],\n", + " 'prec_str': '80.2/68.1/59.8/53.2',\n", + " 'ratio': 1.047690153989734}" + ] + }, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "refs = [filtered_right_ms_en]\n", - "sys = results_ms_en\n", "r = bleu.corpus_score(sys, refs)\n", - "r.__dict__, chrf.corpus_score(sys, refs)" + "r.__dict__" ] }, { diff --git a/session/translation/noisy-hf-t5/export/small.ipynb b/session/translation/noisy-hf-t5/export/small.ipynb index 54e8dc34..e5b73099 100644 --- a/session/translation/noisy-hf-t5/export/small.ipynb +++ b/session/translation/noisy-hf-t5/export/small.ipynb @@ -8,25 +8,62 @@ "source": [ "import os\n", "\n", - "os.environ['CUDA_VISIBLE_DEVICES'] = ''" + "os.environ['CUDA_VISIBLE_DEVICES'] = '0'" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "from tqdm import tqdm" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import T5Tokenizer, T5ForConditionalGeneration\n", + "\n", + "tokenizer = T5Tokenizer.from_pretrained('mesolitica/t5-small-standard-bahasa-cased')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# count = []\n", + "# with open('shuffled-train.json') as fopen:\n", + "# for l in tqdm(fopen):\n", + "# data = json.loads(l)\n", + "# src = data['translation']['src']\n", + "# tgt = data['translation']['tgt']\n", + "# if 'promo' in src or 'promo' in tgt:\n", + "# count.append(data)\n", + " \n", + "# count" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['finetune-t5-small-noisy-bahasa-cased/checkpoint-150000',\n", - " 'finetune-t5-small-noisy-bahasa-cased/checkpoint-160000',\n", - " 'finetune-t5-small-noisy-bahasa-cased/checkpoint-170000',\n", - " 
'finetune-t5-small-noisy-bahasa-cased/checkpoint-180000',\n", - " 'finetune-t5-small-noisy-bahasa-cased/checkpoint-190000']" + "['finetune-t5-small-standard-bahasa-cased-combined/checkpoint-680000',\n", + " 'finetune-t5-small-standard-bahasa-cased-combined/checkpoint-690000',\n", + " 'finetune-t5-small-standard-bahasa-cased-combined/checkpoint-700000']" ] }, - "execution_count": 2, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -34,32 +71,29 @@ "source": [ "from glob import glob\n", "\n", - "checkpoints = sorted(glob('finetune-t5-small-noisy-bahasa-cased/checkpoint-*'))\n", + "checkpoints = sorted(glob('finetune-t5-small-standard-bahasa-cased-combined/checkpoint-*'))\n", "checkpoints" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ - "from transformers import T5Tokenizer, T5ForConditionalGeneration\n", - "\n", - "tokenizer = T5Tokenizer.from_pretrained('mesolitica/t5-small-standard-bahasa-cased')\n", "model = T5ForConditionalGeneration.from_pretrained(checkpoints[-1])" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " Hi guys! I noticed yesterday and today many have got these cookies, right? So today I want to share some of our first batch of mortem posts:\n" + " Hi guys! I noticed yesterday & today many people have got these cookies, right? So today I want to share some post mortem of our first batch:\n" ] } ], @@ -71,301 +105,418 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " Hai kawan-kawan! Saya perhatikan semalam & harini dah ramai yang dapat cookies ni kan. 
Jadi harini saya nak kongsi beberapa post mortem kumpulan pertama kami:\n" + " mesolitica can make asr or not\n" ] } ], "source": [ - "input_ids = tokenizer.encode('terjemah Inggeris ke Melayu: Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch:', return_tensors = 'pt')\n", + "input_ids = tokenizer.encode('terjemah Melayu ke Inggeris: mesolitica boleh buat asr tak', return_tensors = 'pt')\n", "outputs = model.generate(input_ids, max_length = 100)\n", "print(tokenizer.decode(outputs[0]))" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " I don't understand la\n", - " At 8 o'clock in the market, it's a great place to choose.\n", - " So it's a fucking shit.\n", - " Where do you want to go?\n", - " It's like taking half a day\n", - " I've been making this gengs and sold haha salad only k and haha drinks only k.\n", - " I'll see what tickets from Kuala Lumpur are at.\n", - " Imagine PH and win pru-14. There are many back doorways. Last-last Ismail Sabri goes up. That's why I don't give a fk about politics anymore. I swear I'm fk up.\n" + " Brother, I want to copy it on Facebook...hahaha. If not, I'll ss\n" ] } ], "source": [ - "strings = [\n", - " 'ak tak paham la',\n", - " 'jam 8 di pasar KK memang org ramai 😂, pandai dia pilih tmpt.',\n", - " 'Jadi haram jadah😀😃🤭',\n", - " 'nak gi mana tuu',\n", - " 'Macam nak ambil half day',\n", - " 'jadi aku tadi bikin ini gengs dan dijual haha salad only k dan haha drinks only k',\n", - " 'nanti aku tengok dulu tiket dari Kuala Lumpur pukul berapa ada ya',\n", - " \"Bayangkan PH dan menang pru-14. Pastu macam-macam pintu belakang ada. Last-last Ismail Sabri naik. That's why I don't give a fk about politics anymore. 
Sumpah dah fk up dah.\",\n", - "]\n", - "for s in strings:\n", - " input_ids = tokenizer.encode(f'terjemah Melayu ke Inggeris: {s}', return_tensors = 'pt')\n", - " outputs = model.generate(input_ids, max_length = 100)\n", - " print(tokenizer.decode(outputs[0]))" + "input_ids = tokenizer.encode('terjemah Melayu ke Inggeris: Bang, aku nak copy masuk kat fb…hahaha. Kalau xleh aku ss', return_tensors = 'pt')\n", + "outputs = model.generate(input_ids, max_length = 100)\n", + "print(tokenizer.decode(outputs[0]))" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " u ni, bercakap lah\n", - " baru sahaja menghadiri majlis perkahwinan sepupu saya. pelik jugak dia buat biasa2 je sebab gaya hidupnya kelihatan mewah. kemudian saya dapati mereka akan berbulan madu 3 minggu. keputusan pintar \n", - " Saya selepas melihat video ini: saya dapnya burger benjo extra mayo\n", - " power lah even shopback datang edmw riao\n", - " Hai kawan-kawan! Saya perhatikan semalam & harini dah ramai yang dapat cookies ni kan. Jadi harini saya nak kongsi beberapa post mortem kumpulan pertama kami:\n" + " Memang betul. Ini tidak perlu menjadi pakar, saya juga tahu. Ia adalah isyarat, bodoh.\n" ] } ], "source": [ - "strings = [\n", - " 'u ni, talk properly lah',\n", - " \"just attended my cousin's wedding. pelik jugak dia buat majlis biasa2 je sebab her lifestyle looks lavish. then i found out they're going on a 3 weeks honeymoon. smart decision 👍\",\n", - " 'Me after seeing this video: mm dapnya burger benjo extra mayo',\n", - " 'power lah even shopback come to edmw riao',\n", - " 'Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. 
So harini i nak share some post mortem of our first batch:',\n", - "]\n", - "for s in strings:\n", - " input_ids = tokenizer.encode(f'terjemah Inggeris ke Melayu: {s}', return_tensors = 'pt')\n", - " outputs = model.generate(input_ids, max_length = 100)\n", - " print(tokenizer.decode(outputs[0]))" + "input_ids = tokenizer.encode(\"terjemah pasar Melayu ke Melayu: Memanglah. Ini tak payah expert, aku pun tau. It's a gesture, bodoh.\", return_tensors = 'pt')\n", + "outputs = model.generate(input_ids, max_length = 100)\n", + "print(tokenizer.decode(outputs[0]))" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443\n" + " Saya menulis untuk memohon jawatan Senior Software Engineer di [Company]. Sebagai seorang jurutera perisian yang sangat mahir dan berpengalaman dengan pengkhususan dalam seni bina data besar dan pemprosesan bahasa semula jadi, saya percaya saya akan menjadi tambahan berharga kepada pasukan anda\n" ] } ], "source": [ - "model.push_to_hub('finetune-noisy-translation-t5-small-bahasa-cased', organization='mesolitica')" + "input_ids = tokenizer.encode('terjemah Inggeris ke pasar Melayu: I am writing to apply for the Senior Software Engineer position at [Company]. As a highly skilled and experienced software engineer with a specialization in big data architecture and natural language processing, I believe I would be a valuable addition to your team', return_tensors = 'pt')\n", + "outputs = model.generate(input_ids, max_length = 100)\n", + "print(tokenizer.decode(outputs[0]))" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Jadi aku wasap mak aku yang aku sayang malam tu juga. 
Sambil tu aku tengah makan western food tu tapi.. aku tak sangka makanan tu sedap sebab aku tengah fikir nak settlekan benda yang ada dalam otak aku ni\n" + ] + } + ], "source": [ - "tokenizer.push_to_hub('finetune-noisy-translation-t5-small-bahasa-cased', organization='mesolitica')" + "input_ids = tokenizer.encode(\"terjemah Inggeris ke pasar Melayu: So I smoked my mom whom I love that night too. Meanwhile, I was eating the western food but.. I didn't think that the food was good because I was thinking about settling the things that are in my brain\", return_tensors = 'pt')\n", + "outputs = model.generate(input_ids, max_length = 100)\n", + "print(tokenizer.decode(outputs[0]))" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['nous', 'can', 'do', 'asr', 'not?']" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "!cp -r finetune-t5-tiny-noisy-bahasa-cased/runs finetune-noisy-translation-t5-tiny-bahasa-cased\n", - "!cd finetune-noisy-translation-t5-tiny-bahasa-cased && git add . 
&& git commit -m 'add tensorboard' && git push" + "t = 'nous bleh buat asr tak?'\n", + "input_ids = [{'input_ids': tokenizer.encode(f'terjemah Melayu ke Inggeris: {s}', return_tensors='pt')[\n", + " 0]} for s in t.split()]\n", + "padded = tokenizer.pad(input_ids, padding='longest')\n", + "outputs = model.generate(**padded, max_length = 50)\n", + "tokenizer.batch_decode(outputs, skip_special_tokens=True)" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 14, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " asr\n" + ] + } + ], "source": [ - "from sacrebleu.metrics import BLEU, CHRF, TER\n", - "\n", - "bleu = BLEU()\n", - "chrf = CHRF(word_order = 2)" + "input_ids = tokenizer.encode('terjemah Melayu ke Inggeris: asr', return_tensors = 'pt')\n", + "outputs = model.generate(input_ids, max_length = 100)\n", + "print(tokenizer.decode(outputs[0]))\n" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Is there a promo?\n" + ] + } + ], + "source": [ + "input_ids = tokenizer.encode('terjemah Melayu ke Inggeris: ada promo x?', return_tensors = 'pt')\n", + "outputs = model.generate(input_ids, max_length = 100)\n", + "print(tokenizer.decode(outputs[0]))" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " is there a promo?\n" + ] + } + ], + "source": [ + "input_ids = tokenizer.encode('terjemah Melayu ke Inggeris: ada promo?', return_tensors = 'pt')\n", + "outputs = model.generate(input_ids, max_length = 100)\n", + "print(tokenizer.decode(outputs[0]))" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " mesolitica can b\n" + ] + } + ], + "source": [ + 
"input_ids = tokenizer.encode('terjemah Melayu ke Inggeris: mesolitica bleh b', return_tensors = 'pt')\n", + "outputs = model.generate(input_ids, max_length = 100)\n", + "print(tokenizer.decode(outputs[0]))" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " got a promo?\n" + ] + } + ], + "source": [ + "input_ids = tokenizer.encode('terjemah Melayu ke Inggeris: got promo?', return_tensors = 'pt')\n", + "outputs = model.generate(input_ids, max_length = 100)\n", + "print(tokenizer.decode(outputs[0]))" + ] + }, + { + "cell_type": "code", + "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "6854" + "['Saya sedang menulis untuk memohon jawatan Senior Software Engineer di [ Kompany]. Sebagai seorang jurutera perisian yang sangat mahir dan berpengalaman dengan ada pengkhususan dalam seni bina data besar dan pemprosesan bahasa semula jadi, saya percaya saya akan menjadi tambahan berharga buat pasukan anda',\n", + " 'Saya sedang menulis untuk memohon jawatan Jurutera Software Kanan di [ Kompany ]. Sebagai seorang jurutera perisian yang sangat mahir dan berpengalaman dengan khasisasi dalam seni bina data besar dan pemprosesan bahasa semula jadi, saya percaya saya akan menjadi tambahan yang berharga kepada pasukan anda',\n", + " 'Saya menulis untuk memohon jawatan Senior Software Engineer di [ Syarikat ]. 
Sebagai seorang jurutera perisian yang berkemahiran tinggi dan berpengalaman dengan khasisasi dalam seni bina data besar dan pemprosesan bahasa semula jadi, saya percaya saya akan menjadi penambahan yang berharga kepada pasukan anda']" ] }, - "execution_count": 9, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "from unidecode import unidecode\n", - "import json\n", - "\n", - "with open('test-noisy-shuffled.json') as fopen:\n", - " test = fopen.read().split('\\n')\n", - " test = [json.loads(t) for t in test if len(t)]\n", - " \n", - "len(test)" + "input_ids = tokenizer.encode('terjemah Inggeris ke pasar Melayu: I am writing to apply for the Senior Software Engineer position at [Company]. As a highly skilled and experienced software engineer with a specialization in big data architecture and natural language processing, I believe I would be a valuable addition to your team', return_tensors = 'pt')\n", + "outputs = model.generate(input_ids, max_length = 100, do_sample=True, \n", + " top_k=100, \n", + " top_p=0.95, temperature=0.7,\n", + " num_return_sequences=3)\n", + "tokenizer.batch_decode(outputs, skip_special_tokens=True)" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 20, "metadata": {}, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|████████████████████████████████████████████████████████████████████████████████████████████| 6854/6854 [25:43<00:00, 4.44it/s]\n" - ] + "data": { + "text/plain": [ + "['Rakyat mmg tak suka kau jugak',\n", + " 'rakyat x suka ko pun',\n", + " 'Rakyat mmg tak suka kau pon']" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "from tqdm import tqdm\n", - "\n", - "batch_size = 1\n", - "\n", - "results_en_ms, filtered_right_en_ms = [], []\n", - "results_ms_en, filtered_right_ms_en = [], []\n", - "for i in tqdm(range(len(test))):\n", - " t = test[i]['translation']\n", - " p = 
t['prefix']\n", - " s = t['src']\n", - " tgt = t['tgt']\n", - " \n", - " input_ids = [{'input_ids': tokenizer.encode(f'{p}{s}', return_tensors = 'pt')[0]}]\n", - " padded = tokenizer.pad(input_ids, padding = 'longest')\n", - " outputs = model.generate(**padded, max_length = 1000)[0]\n", - " o = tokenizer.decode(outputs, skip_special_tokens=True)\n", - " if len(o):\n", - " if 'Inggeris ke Melayu' in p:\n", - " results_en_ms.append(o)\n", - " filtered_right_en_ms.append(tgt)\n", - " else:\n", - " results_ms_en.append(o)\n", - " filtered_right_ms_en.append(tgt)" + "input_ids = tokenizer.encode(\"terjemah Melayu ke pasar Melayu: rakyat memang tak suka awak pun\", return_tensors = 'pt')\n", + "outputs = model.generate(input_ids, max_length = 100, do_sample=True, \n", + " top_k=50, \n", + " top_p=0.95, \n", + " temperature=0.7,\n", + " num_return_sequences=3)\n", + "tokenizer.batch_decode(outputs, skip_special_tokens=True)" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(2937, 3917)" + "['Perbincangan khas itu juga bertujuan utk Seri Paduka Baginda mendapat pandangan Raja2 Melayu bagi membolehkan baginda membuat keputusan terbaik demi dan kesejahteraan negara serta rakyat',\n", + " 'Perbincangan khas itu juga bertujuan untuk Baginda mendapat pandangan Raja2 Melayu bagi membolehkan baginda membuat keputusan yang terbaik demi dan kesejahteraan negara dan rakyat',\n", + " 'Perbincangan khas itu juga bertujuan agar Baginda mendapat pandangan Raja2 Melayu bagi membolehkan Baginda membuat keputusan yang terbaik demi kepentingan dan kesejahteraan negara serta rakyat']" ] }, - "execution_count": 11, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "len(results_en_ms), len(results_ms_en)" + "input_ids = tokenizer.encode(\"terjemah Melayu ke pasar Melayu: Perbincangan khas itu juga bertujuan bagi Seri Paduka mendapat pandangan Raja-Raja Melayu untuk 
membolehkan baginda membuat keputusan yang terbaik demi kepentingan dan kesejahteraan negara serta rakyat\", return_tensors = 'pt')\n", + "outputs = model.generate(input_ids, max_length = 100, do_sample=True, \n", + " top_k=100, \n", + " top_p=0.95,\n", + " temperature=0.7,\n", + " num_return_sequences=3)\n", + "tokenizer.batch_decode(outputs, skip_special_tokens=True)" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "({'name': 'BLEU',\n", - " 'score': 41.15794003172596,\n", - " '_mean': -1.0,\n", - " '_ci': -1.0,\n", - " '_verbose': '72.2/48.8/34.5/24.8 (BP = 0.988 ratio = 0.988 hyp_len = 63689 ref_len = 64473)',\n", - " 'bp': 0.9877656378545313,\n", - " 'counts': [45968, 29622, 19968, 13610],\n", - " 'totals': [63689, 60752, 57815, 54878],\n", - " 'sys_len': 63689,\n", - " 'ref_len': 64473,\n", - " 'precisions': [72.17572893278275,\n", - " 48.758888596260206,\n", - " 34.537749718931074,\n", - " 24.800466489303545],\n", - " 'prec_str': '72.2/48.8/34.5/24.8',\n", - " 'ratio': 0.9878398709537326},\n", - " chrF2++ = 65.51)" + "['can', 'do', 'asr', 'no']" ] }, - "execution_count": 12, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "refs = [filtered_right_en_ms]\n", - "sys = results_en_ms\n", - "r = bleu.corpus_score(sys, refs)\n", - "r.__dict__, chrf.corpus_score(sys, refs)" + "t = 'bleh buat asr tak'\n", + "input_ids = [{'input_ids': tokenizer.encode(f'terjemah Melayu ke Inggeris: {s}', return_tensors='pt')[\n", + " 0]} for s in t.split()]\n", + "padded = tokenizer.pad(input_ids, padding='longest')\n", + "outputs = model.generate(**padded, max_length = 50)\n", + "tokenizer.batch_decode(outputs, skip_special_tokens=True)\n" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " can you do asr or not?\n" + ] + } + ], 
+ "source": [ + "input_ids = tokenizer.encode(f'terjemah Melayu ke Inggeris: {t}', return_tensors = 'pt')\n", + "outputs = model.generate(input_ids, max_length = 100)\n", + "print(tokenizer.decode(outputs[0]))" + ] + }, + { + "cell_type": "code", + "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "({'name': 'BLEU',\n", - " 'score': 41.83407099646298,\n", - " '_mean': -1.0,\n", - " '_ci': -1.0,\n", - " '_verbose': '71.7/48.7/35.4/26.0 (BP = 0.989 ratio = 0.989 hyp_len = 91952 ref_len = 92985)',\n", - " 'bp': 0.9888287449603974,\n", - " 'counts': [65929, 42830, 29766, 20815],\n", - " 'totals': [91952, 88035, 84118, 80201],\n", - " 'sys_len': 91952,\n", - " 'ref_len': 92985,\n", - " 'precisions': [71.6993648860275,\n", - " 48.65110467427728,\n", - " 35.386005373404025,\n", - " 25.95354172641239],\n", - " 'prec_str': '71.7/48.7/35.4/26.0',\n", - " 'ratio': 0.9888906812926817},\n", - " chrF2++ = 64.52)" + "[\"I don't understand\",\n", + " 'Hi guys! I noticed yesterday & today many people have got these cookies, right? So today I want to share some post mortem of our first batch:',\n", + " \"Indeed. This doesn't need to be an expert, I know too. It's a gesture, stupid.\",\n", + " \"at 8 o'clock at the OKAY market, it's really crowded, he's good at choosing a place.\",\n", + " \"So it's illegal\",\n", + " 'where do you want to go?',\n", + " \"It's like taking half a day\",\n", + " \"Imagine Pakatan Harapan and win pru-14. After that there are all kinds of back doors. Last-last Ismail Sabri went up. That's why I pray not to give a fk about politics anymore. I swear it's up.\"]" ] }, - "execution_count": 13, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "refs = [filtered_right_ms_en]\n", - "sys = results_ms_en\n", - "r = bleu.corpus_score(sys, refs)\n", - "r.__dict__, chrf.corpus_score(sys, refs)" + "strings = [\n", + " 'ak tak paham la',\n", + " 'Hi guys! 
I noticed semalam & harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch:',\n", + " \"Memanglah. Ini tak payah expert, aku pun tau. It's a gesture, bodoh.\",\n", + " 'jam 8 di pasar KK memang org ramai 😂, pandai dia pilih tmpt.',\n", + " 'Jadi haram jadah😀😃🤭',\n", + " 'nak gi mana tuu',\n", + " 'Macam nak ambil half day',\n", + " \"Bayangkan PH dan menang pru-14. Pastu macam-macam pintu belakang ada. Last-last Ismail Sabri naik. That's why I don't give a fk about politics anymore. Sumpah dah fk up dah.\",\n", + "]\n", + "input_ids = [{'input_ids': tokenizer.encode(f'terjemah Melayu ke Inggeris: {s}', return_tensors='pt')[\n", + " 0]} for s in strings]\n", + "padded = tokenizer.pad(input_ids, padding='longest')\n", + "outputs = model.generate(**padded, max_length = 100)\n", + "tokenizer.batch_decode(outputs, skip_special_tokens=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['ini awak, bercakap dengan betul',\n", + " 'baru menghadiri majlis perkahwinan sepupu saya. Pelik juga dia buat majlis biasa-biasa sebab gaya hidup dia nampak mewah. kemudian saya mendapat tahu mereka akan berbulan madu selama 3 minggu. keputusan pintar',\n", + " 'Saya selepas melihat video ini: memang sedap burger benjo extra mayo',\n", + " 'Hai semua! Saya perasan semalam & hari ni ramai yang dapat biskut ni kan? Jadi hari ini saya ingin berkongsi beberapa post mortem batch pertama kami:']" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "strings = [\n", + " 'u ni, talk properly lah',\n", + " \"just attended my cousin's wedding. pelik jugak dia buat majlis biasa2 je sebab her lifestyle looks lavish. then i found out they're going on a 3 weeks honeymoon. smart decision 👍\",\n", + " 'Me after seeing this video: mm dapnya burger benjo extra mayo',\n", + " 'Hi guys! 
I noticed semalam & harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch:',\n", + "]\n", + "input_ids = [{'input_ids': tokenizer.encode(f'terjemah pasar Melayu ke Melayu: {s}', return_tensors='pt')[\n", + " 0]} for s in strings]\n", + "padded = tokenizer.pad(input_ids, padding='longest')\n", + "outputs = model.generate(**padded, max_length = 100)\n", + "tokenizer.batch_decode(outputs, skip_special_tokens=True)" ] }, { @@ -373,7 +524,117 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "model.push_to_hub('finetune-noisy-translation-t5-small-bahasa-cased-v4', organization='mesolitica')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "tokenizer.push_to_hub('finetune-noisy-translation-t5-small-bahasa-cased-v4', organization='mesolitica')" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "_ = model.cuda()" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "5000it [07:05, 11.76it/s]\n" + ] + } + ], + "source": [ + "filtered_left, filtered_right = [], []\n", + "\n", + "with open('shuffled-test.json') as fopen:\n", + " for l in tqdm(fopen):\n", + " data = json.loads(l)['translation']\n", + " p = data['prefix']\n", + " src = data['src']\n", + " input_ids = [{'input_ids': tokenizer.encode(f'{p}: {src}', return_tensors = 'pt')[0]}]\n", + " padded = tokenizer.pad(input_ids, padding = 'longest')\n", + " for k in padded.keys():\n", + " padded[k] = padded[k].cuda()\n", + " outputs = model.generate(**padded, max_length = 256)\n", + " filtered_left.append(tokenizer.decode(outputs[0], skip_special_tokens=True))\n", + " filtered_right.append(data['tgt'])" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + 
"outputs": [], + "source": [ + "refs = [filtered_right]\n", + "sys = filtered_left" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "from sacrebleu.metrics import BLEU, CHRF, TER\n", + "\n", + "bleu = BLEU()\n", + "chrf = CHRF(word_order = 2)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'name': 'BLEU',\n", + " 'score': 64.06258219941243,\n", + " '_mean': -1.0,\n", + " '_ci': -1.0,\n", + " '_verbose': '80.1/67.7/59.1/52.5 (BP = 1.000 ratio = 1.042 hyp_len = 111635 ref_len = 107150)',\n", + " 'bp': 1.0,\n", + " 'counts': [89388, 72166, 60115, 50792],\n", + " 'totals': [111635, 106635, 101635, 96656],\n", + " 'sys_len': 111635,\n", + " 'ref_len': 107150,\n", + " 'precisions': [80.07166211313655,\n", + " 67.67571622825527,\n", + " 59.14793132287106,\n", + " 52.549246813441485],\n", + " 'prec_str': '80.1/67.7/59.1/52.5',\n", + " 'ratio': 1.0418572095193654}" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "r = bleu.corpus_score(sys, refs)\n", + "r.__dict__" + ] } ], "metadata": { diff --git a/session/translation/noisy-hf-t5/export/tiny.ipynb b/session/translation/noisy-hf-t5/export/tiny.ipynb index 79031f5a..d3fc89e4 100644 --- a/session/translation/noisy-hf-t5/export/tiny.ipynb +++ b/session/translation/noisy-hf-t5/export/tiny.ipynb @@ -8,30 +8,34 @@ "source": [ "import os\n", "\n", - "os.environ['CUDA_VISIBLE_DEVICES'] = ''" + "os.environ['CUDA_VISIBLE_DEVICES'] = '0'" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, + "outputs": [], + "source": [ + "from transformers import T5Tokenizer, T5ForConditionalGeneration\n", + "\n", + "tokenizer = T5Tokenizer.from_pretrained('mesolitica/t5-small-standard-bahasa-cased')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, "outputs": [ { "data": { "text/plain": [ - 
"['finetune-t5-tiny-noisy-bahasa-cased/checkpoint-2790000',\n", - " 'finetune-t5-tiny-noisy-bahasa-cased/checkpoint-2800000',\n", - " 'finetune-t5-tiny-noisy-bahasa-cased/checkpoint-2810000',\n", - " 'finetune-t5-tiny-noisy-bahasa-cased/checkpoint-2820000',\n", - " 'finetune-t5-tiny-noisy-bahasa-cased/checkpoint-2830000',\n", - " 'finetune-t5-tiny-noisy-bahasa-cased/checkpoint-2840000',\n", - " 'finetune-t5-tiny-noisy-bahasa-cased/checkpoint-2850000',\n", - " 'finetune-t5-tiny-noisy-bahasa-cased/checkpoint-2860000',\n", - " 'finetune-t5-tiny-noisy-bahasa-cased/checkpoint-2870000',\n", - " 'finetune-t5-tiny-noisy-bahasa-cased/checkpoint-2880000']" + "['finetune-t5-tiny-standard-bahasa-cased-combined/checkpoint-1150000',\n", + " 'finetune-t5-tiny-standard-bahasa-cased-combined/checkpoint-1160000',\n", + " 'finetune-t5-tiny-standard-bahasa-cased-combined/checkpoint-1170000']" ] }, - "execution_count": 2, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -39,32 +43,29 @@ "source": [ "from glob import glob\n", "\n", - "checkpoints = sorted(glob('finetune-t5-tiny-noisy-bahasa-cased/checkpoint-*'))\n", + "checkpoints = sorted(glob('finetune-t5-tiny-standard-bahasa-cased-combined/checkpoint-*'))\n", "checkpoints" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ - "from transformers import T5Tokenizer, T5ForConditionalGeneration\n", - "\n", - "tokenizer = T5Tokenizer.from_pretrained('mesolitica/t5-small-standard-bahasa-cased')\n", "model = T5ForConditionalGeneration.from_pretrained(checkpoints[-1])" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " Hi guys! I noticed yesterday and today many of our cookies are available. So today I want to share some of our first batch post mortem:\n" + " Hi guys! I noticed yesterday & today many people got this cookies, right? 
So today I want to share some post mortem of our first batch:\n" ] } ], @@ -76,343 +77,473 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " Hai kawan! Saya perhatikan semalam & harini dah ramai yang dapat cookies ni kan. Jadi harini saya nak kongsi post mortem batch pertama kami:\n" + " Mesolitica can make asr or not\n" ] } ], "source": [ - "input_ids = tokenizer.encode('terjemah Inggeris ke Melayu: Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch:', return_tensors = 'pt')\n", + "input_ids = tokenizer.encode('terjemah Melayu ke Inggeris: mesolitica boleh buat asr tak', return_tensors = 'pt')\n", "outputs = model.generate(input_ids, max_length = 100)\n", "print(tokenizer.decode(outputs[0]))" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " I don't understand\n", - " 8th in the KK market, he was good at choosing a place.\n", - " So it's illegal.\n", - " Where do you want to go?\n", - " It's like taking half a day\n", - " Imagine PH and win pru-14. The past is a bit of a back door. Ismail Sabri's last-last rise. That's why I don't give a fk about politics anymore. I swear it's up.\n" + " Brother, I want to copy it on fb...hahaha. If I can't ss\n" ] } ], "source": [ - "strings = [\n", - " 'ak tak paham la',\n", - " 'jam 8 di pasar KK memang org ramai 😂, pandai dia pilih tmpt.',\n", - " 'Jadi haram jadah😀😃🤭',\n", - " 'nak gi mana tuu',\n", - " 'Macam nak ambil half day',\n", - " \"Bayangkan PH dan menang pru-14. Pastu macam-macam pintu belakang ada. Last-last Ismail Sabri naik. That's why I don't give a fk about politics anymore. 
Sumpah dah fk up dah.\",\n", - "]\n", - "for s in strings:\n", - " input_ids = tokenizer.encode(f'terjemah Melayu ke Inggeris: {s}', return_tensors = 'pt')\n", - " outputs = model.generate(input_ids, max_length = 100)\n", - " print(tokenizer.decode(outputs[0]))" + "input_ids = tokenizer.encode('terjemah Melayu ke Inggeris: Bang, aku nak copy masuk kat fb…hahaha. Kalau xleh aku ss', return_tensors = 'pt')\n", + "outputs = model.generate(input_ids, max_length = 100)\n", + "print(tokenizer.decode(outputs[0]))" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " u ni, cakap betul lah\n", - " baru sahaja menghadiri majlis perkahwinan sepupu saya. pelik jugak dia buat majlis biasa2 je sebab gaya hidupnya kelihatan mewah. kemudian saya tahu mereka akan berbulan madu 3 minggu. keputusan pintar \n", - " Saya setelah melihat video ini: mm dapnya burger benjo extra mayo\n", - " Hai kawan! Saya perhatikan semalam & harini dah ramai yang dapat cookies ni kan. Jadi harini saya nak kongsi post mortem batch pertama kami:\n" + " Brother, I want to copy it on fb...hahaha. If I can't ss\n" ] } ], "source": [ - "strings = [\n", - " 'u ni, talk properly lah',\n", - " \"just attended my cousin's wedding. pelik jugak dia buat majlis biasa2 je sebab her lifestyle looks lavish. then i found out they're going on a 3 weeks honeymoon. smart decision 👍\",\n", - " 'Me after seeing this video: mm dapnya burger benjo extra mayo',\n", - " 'Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. 
So harini i nak share some post mortem of our first batch:',\n", - "]\n", - "for s in strings:\n", - " input_ids = tokenizer.encode(f'terjemah Inggeris ke Melayu: {s}', return_tensors = 'pt')\n", - " outputs = model.generate(input_ids, max_length = 100)\n", - " print(tokenizer.decode(outputs[0]))" + "input_ids = tokenizer.encode('terjemah Melayu ke Inggeris: Bang, aku nak copy masuk kat fb…hahaha. Kalau xleh aku ss', return_tensors = 'pt')\n", + "outputs = model.generate(input_ids, max_length = 100)\n", + "print(tokenizer.decode(outputs[0]))" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [ { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "e6cd404bd5e04405a00d9cf954586690", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Upload file pytorch_model.bin: 0%| | 4.00k/133M [00:00 Memang betul. Ini tidak perlu pakar, saya juga tahu. Ia adalah isyarat, bodoh.\n" + ] + } + ], + "source": [ + "input_ids = tokenizer.encode(\"terjemah pasar Melayu ke Melayu: Memanglah. Ini tak payah expert, aku pun tau. It's a gesture, bodoh.\", return_tensors = 'pt')\n", + "outputs = model.generate(input_ids, max_length = 100)\n", + "print(tokenizer.decode(outputs[0]))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "remote: Scanning LFS files for validity, may be slow... \n", - "remote: LFS file scan complete. \n", - "To https://huggingface.co/mesolitica/finetune-noisy-translation-t5-tiny-bahasa-cased\n", - " 3d32735..3be598e main -> main\n", - "\n" + " Saya menulis untuk memohon jawatan Senior Software Engineer di [Company]. 
Sebagai seorang jurutera perisian yang mahir dan berpengalaman yang mempunyai kepakaran dalam seni bina data besar dan pemprosesan bahasa semulajadi, saya percaya saya akan menjadi tambahan berharga kepada pasukan anda\n" ] - }, + } + ], + "source": [ + "input_ids = tokenizer.encode('terjemah Inggeris ke pasar Melayu: I am writing to apply for the Senior Software Engineer position at [Company]. As a highly skilled and experienced software engineer with a specialization in big data architecture and natural language processing, I believe I would be a valuable addition to your team', return_tensors = 'pt')\n", + "outputs = model.generate(input_ids, max_length = 100)\n", + "print(tokenizer.decode(outputs[0]))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Jadi aku hisap mak aku yang aku sayang malam tu juga. Sedangkan aku tengah makan makanan barat tapi.. tak sangka makanan tu sedap sebab aku terfikir nak settlekan benda yang ada dalam otak aku\n" + ] + } + ], + "source": [ + "input_ids = tokenizer.encode(\"terjemah Inggeris ke pasar Melayu: So I smoked my mom whom I love that night too. Meanwhile, I was eating the western food but.. 
I didn't think that the food was good because I was thinking about settling the things that are in my brain\", return_tensors = 'pt')\n", + "outputs = model.generate(input_ids, max_length = 100)\n", + "print(tokenizer.decode(outputs[0]))" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ { "data": { "text/plain": [ - "'https://huggingface.co/mesolitica/finetune-noisy-translation-t5-tiny-bahasa-cased/commit/3be598e914c794d8ef31fede774bba6ca8e55a5d'" + "['can', 'do it', 'asr', 'no']" ] }, - "execution_count": 8, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "model.push_to_hub('finetune-noisy-translation-t5-tiny-bahasa-cased', organization='mesolitica')" + "t = 'bleh buat asr tak'\n", + "input_ids = [{'input_ids': tokenizer.encode(f'terjemah Melayu ke Inggeris: {s}', return_tensors='pt')[\n", + " 0]} for s in t.split()]\n", + "padded = tokenizer.pad(input_ids, padding='longest')\n", + "outputs = model.generate(**padded, max_length = 50)\n", + "tokenizer.batch_decode(outputs, skip_special_tokens=True)\n" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 13, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " can you make an asr?\n" + ] + } + ], "source": [ - "tokenizer.push_to_hub('finetune-noisy-translation-t5-tiny-bahasa-cased', organization='mesolitica')" + "input_ids = tokenizer.encode(f'terjemah Melayu ke Inggeris: {t}', return_tensors = 'pt')\n", + "outputs = model.generate(input_ids, max_length = 100)\n", + "print(tokenizer.decode(outputs[0]))" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[main 99b33f9] add tensorboard\n", - " 1 file changed, 2 insertions(+), 2 deletions(-)\n", - "Uploading LFS objects: 100% (1/1), 2.6 MB | 342 KB/s, done. 
\n", - "Enumerating objects: 9, done.\n", - "Counting objects: 100% (9/9), done.\n", - "Delta compression using up to 16 threads\n", - "Compressing objects: 100% (5/5), done.\n", - "Writing objects: 100% (5/5), 488 bytes | 488.00 KiB/s, done.\n", - "Total 5 (delta 3), reused 0 (delta 0)\n", - "remote: Scanning LFS files for validity, may be slow...\u001b[K\n", - "remote: LFS file scan complete.\u001b[K\n", - "To https://huggingface.co/mesolitica/finetune-noisy-translation-t5-tiny-bahasa-cased\n", - " 3be598e..99b33f9 main -> main\n" + " asr\n" ] } ], "source": [ - "!cp -r finetune-t5-tiny-noisy-bahasa-cased/runs finetune-noisy-translation-t5-tiny-bahasa-cased\n", - "!cd finetune-noisy-translation-t5-tiny-bahasa-cased && git add . && git commit -m 'add tensorboard' && git push" + "input_ids = tokenizer.encode('terjemah Melayu ke Inggeris: asr', return_tensors = 'pt')\n", + "outputs = model.generate(input_ids, max_length = 100)\n", + "print(tokenizer.decode(outputs[0]))\n" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 15, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Is there a promo?\n" + ] + } + ], "source": [ - "from sacrebleu.metrics import BLEU, CHRF, TER\n", - "\n", - "bleu = BLEU()\n", - "chrf = CHRF(word_order = 2)" + "input_ids = tokenizer.encode('terjemah Melayu ke Inggeris: ada promo x?', return_tensors = 'pt')\n", + "outputs = model.generate(input_ids, max_length = 100)\n", + "print(tokenizer.decode(outputs[0]))" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " is there a promo?\n" + ] + } + ], + "source": [ + "input_ids = tokenizer.encode('terjemah Melayu ke Inggeris: ada promo?', return_tensors = 'pt')\n", + "outputs = model.generate(input_ids, max_length = 100)\n", + "print(tokenizer.decode(outputs[0]))" + ] + }, + { + "cell_type": 
"code", + "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "6854" + "['Perbincangan khas itu juga bertujuan untuk Seri Paduka mendapat pandangan Raja2 Melayu untuk membolehkan baginda membuat keputusan yg terbaik demi kepentingan dan kesejahteraan negara dan rakyat',\n", + " 'Perbincangan khas juga bertujuan untuk Seri Paduka dapat pandangan Raja-Raja Melayu bagi membolehkan baginda membuat keputusan terbaik demi kepentingan dan kesejahteraan negara serta rakyat',\n", + " 'Perbincangan khas itu juga bertujuan untuk Seri Paduka meraih pandangan Raja-Raja Melayu untuk membolehkan baginda membuat keputusan yang terbaik demi kepentingan dan kesejahteraan negara serta rakyat']" ] }, - "execution_count": 12, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "from unidecode import unidecode\n", - "import json\n", - "\n", - "with open('test-noisy-shuffled.json') as fopen:\n", - " test = fopen.read().split('\\n')\n", - " test = [json.loads(t) for t in test if len(t)]\n", - " \n", - "len(test)" + "input_ids = tokenizer.encode(\"terjemah Melayu ke pasar Melayu: Perbincangan khas itu juga bertujuan bagi Seri Paduka mendapat pandangan Raja-Raja Melayu untuk membolehkan baginda membuat keputusan yang terbaik demi kepentingan dan kesejahteraan negara serta rakyat\", return_tensors = 'pt')\n", + "outputs = model.generate(input_ids, max_length = 100, do_sample=True, \n", + " top_k=100, \n", + " top_p=0.95,\n", + " temperature=0.7,\n", + " num_return_sequences=3)\n", + "tokenizer.batch_decode(outputs, skip_special_tokens=True)" ] }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 18, "metadata": {}, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|████████████████████████████████████████████████████████████████████████████████████████████| 6854/6854 [17:29<00:00, 6.53it/s]\n" - ] + "data": { + "text/plain": [ + "[\"I don't understand\",\n", + " 'Hi guys! 
I noticed yesterday & today many people got this cookies, right? So today I want to share some post mortem of our first batch:',\n", + " \"That's it. This doesn't need an expert, I know too. It's a gesture, stupid.\",\n", + " \"at 8 o'clock at the OKAY market, there are many people, he is good at choosing places.\",\n", + " 'So haram jadah',\n", + " 'where do you want to go?',\n", + " \"It's like taking half day\",\n", + " \"Imagine PAKATAN HARAPAN and won pru-14. After that there are all kinds of back doors. Last-last Ismail Sabri went up. That's why I don't give it fk about politics anymore. I swear it's already up.\"]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "from tqdm import tqdm\n", - "\n", - "batch_size = 1\n", - "\n", - "results_en_ms, filtered_right_en_ms = [], []\n", - "results_ms_en, filtered_right_ms_en = [], []\n", - "for i in tqdm(range(len(test))):\n", - " t = test[i]['translation']\n", - " p = t['prefix']\n", - " s = t['src']\n", - " tgt = t['tgt']\n", - " \n", - " input_ids = [{'input_ids': tokenizer.encode(f'{p}{s}', return_tensors = 'pt')[0]}]\n", - " padded = tokenizer.pad(input_ids, padding = 'longest')\n", - " outputs = model.generate(**padded, max_length = 1000)[0]\n", - " o = tokenizer.decode(outputs, skip_special_tokens=True)\n", - " if len(o):\n", - " if 'Inggeris ke Melayu' in p:\n", - " results_en_ms.append(o)\n", - " filtered_right_en_ms.append(tgt)\n", - " else:\n", - " results_ms_en.append(o)\n", - " filtered_right_ms_en.append(tgt)" + "strings = [\n", + " 'ak tak paham la',\n", + " 'Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch:',\n", + " \"Memanglah. Ini tak payah expert, aku pun tau. 
It's a gesture, bodoh.\",\n", + " 'jam 8 di pasar KK memang org ramai 😂, pandai dia pilih tmpt.',\n", + " 'Jadi haram jadah😀😃🤭',\n", + " 'nak gi mana tuu',\n", + " 'Macam nak ambil half day',\n", + " \"Bayangkan PH dan menang pru-14. Pastu macam-macam pintu belakang ada. Last-last Ismail Sabri naik. That's why I don't give a fk about politics anymore. Sumpah dah fk up dah.\",\n", + "]\n", + "input_ids = [{'input_ids': tokenizer.encode(f'terjemah Melayu ke Inggeris: {s}', return_tensors='pt')[\n", + " 0]} for s in strings]\n", + "padded = tokenizer.pad(input_ids, padding='longest')\n", + "outputs = model.generate(**padded, max_length = 100)\n", + "tokenizer.batch_decode(outputs, skip_special_tokens=True)" ] }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(2937, 3917)" + "['awak, cakap betul-betul',\n", + " 'baru sahaja menghadiri perkahwinan sepupu saya. Pelik juga dia buat majlis biasa sebab gaya hidup dia nampak mewah. kemudian saya mendapat tahu mereka sedang berbulan madu selama 3 minggu. keputusan yang bijak',\n", + " 'Saya selepas melihat video ini: memang sedap burger benjo extra mayo',\n", + " 'Hai kawan-kawan! Saya perasan semalam & hari ni ramai yang dapat biskut ni kan? Jadi hari ini saya ingin berkongsi beberapa post mortem batch pertama kami:']" ] }, - "execution_count": 46, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "len(results_en_ms), len(results_ms_en)" + "strings = [\n", + " 'u ni, talk properly lah',\n", + " \"just attended my cousin's wedding. pelik jugak dia buat majlis biasa2 je sebab her lifestyle looks lavish. then i found out they're going on a 3 weeks honeymoon. smart decision 👍\",\n", + " 'Me after seeing this video: mm dapnya burger benjo extra mayo',\n", + " 'Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. 
So harini i nak share some post mortem of our first batch:',\n", + "]\n", + "input_ids = [{'input_ids': tokenizer.encode(f'terjemah pasar Melayu ke Melayu: {s}', return_tensors='pt')[\n", + " 0]} for s in strings]\n", + "padded = tokenizer.pad(input_ids, padding='longest')\n", + "outputs = model.generate(**padded, max_length = 100)\n", + "tokenizer.batch_decode(outputs, skip_special_tokens=True)" ] }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 20, "metadata": {}, "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a1a68f08451942718fa21208b1f923c6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Upload file pytorch_model.bin: 0%| | 4.00k/133M [00:00 main\n", + "\n" + ] + }, { "data": { "text/plain": [ - "({'name': 'BLEU',\n", - " 'score': 41.03641425544081,\n", - " '_mean': -1.0,\n", - " '_ci': -1.0,\n", - " '_verbose': '72.9/49.2/34.8/25.0 (BP = 0.977 ratio = 0.977 hyp_len = 63005 ref_len = 64473)',\n", - " 'bp': 0.976969604853212,\n", - " 'counts': [45920, 29534, 19878, 13530],\n", - " 'totals': [63005, 60068, 57131, 54194],\n", - " 'sys_len': 63005,\n", - " 'ref_len': 64473,\n", - " 'precisions': [72.88310451551465,\n", - " 49.16761004195246,\n", - " 34.793719696837094,\n", - " 24.96586337970993],\n", - " 'prec_str': '72.9/49.2/34.8/25.0',\n", - " 'ratio': 0.9772307787756115},\n", - " chrF2++ = 65.58)" + "'https://huggingface.co/mesolitica/finetune-noisy-translation-t5-tiny-bahasa-cased-v2/commit/d0f6650575b99ba9241a410eb6abad68946f4cb5'" ] }, - "execution_count": 47, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "refs = [filtered_right_en_ms]\n", - "sys = results_en_ms\n", - "r = bleu.corpus_score(sys, refs)\n", - "r.__dict__, chrf.corpus_score(sys, refs)" + "model.push_to_hub('finetune-noisy-translation-t5-tiny-bahasa-cased-v2', organization='mesolitica')" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": 
{}, + "outputs": [], + "source": [ + "tokenizer.push_to_hub('finetune-noisy-translation-t5-tiny-bahasa-cased-v2', organization='mesolitica')" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "_ = model.cuda()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "5000it [09:26, 8.83it/s]\n" + ] + } + ], + "source": [ + "from tqdm import tqdm\n", + "import json\n", + "\n", + "filtered_left, filtered_right = [], []\n", + "\n", + "with open('shuffled-test.json') as fopen:\n", + " for l in tqdm(fopen):\n", + " data = json.loads(l)['translation']\n", + " p = data['prefix']\n", + " src = data['src']\n", + " input_ids = [{'input_ids': tokenizer.encode(f'{p}: {src}', return_tensors = 'pt')[0]}]\n", + " padded = tokenizer.pad(input_ids, padding = 'longest')\n", + " for k in padded.keys():\n", + " padded[k] = padded[k].cuda()\n", + " outputs = model.generate(**padded, max_length = 256)\n", + " filtered_left.append(tokenizer.decode(outputs[0], skip_special_tokens=True))\n", + " filtered_right.append(data['tgt'])" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "refs = [filtered_right]\n", + "sys = filtered_left" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "from sacrebleu.metrics import BLEU, CHRF, TER\n", + "\n", + "bleu = BLEU()\n", + "chrf = CHRF(word_order = 2)" ] }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "({'name': 'BLEU',\n", - " 'score': 39.72513374635353,\n", - " '_mean': -1.0,\n", - " '_ci': -1.0,\n", - " '_verbose': '69.8/46.2/32.8/23.6 (BP = 0.999 ratio = 0.999 hyp_len = 92913 ref_len = 92985)',\n", - " 'bp': 0.9992253816996589,\n", - " 'counts': [64860, 41099, 27914, 19169],\n", 
- " 'totals': [92913, 88996, 85079, 81162],\n", - " 'sys_len': 92913,\n", - " 'ref_len': 92985,\n", - " 'precisions': [69.80723903006037,\n", - " 46.18072722369545,\n", - " 32.80950645870309,\n", - " 23.61819570735073],\n", - " 'prec_str': '69.8/46.2/32.8/23.6',\n", - " 'ratio': 0.9992256815615422},\n", - " chrF2++ = 63.16)" + "{'name': 'BLEU',\n", + " 'score': 60.0009672168891,\n", + " '_mean': -1.0,\n", + " '_ci': -1.0,\n", + " '_verbose': '77.9/63.9/54.6/47.7 (BP = 1.000 ratio = 1.036 hyp_len = 110970 ref_len = 107150)',\n", + " 'bp': 1.0,\n", + " 'counts': [86448, 67686, 55157, 45770],\n", + " 'totals': [110970, 105970, 100970, 95989],\n", + " 'sys_len': 110970,\n", + " 'ref_len': 107150,\n", + " 'precisions': [77.90213571235469,\n", + " 63.87279418703407,\n", + " 54.62711696543528,\n", + " 47.68254695850566],\n", + " 'prec_str': '77.9/63.9/54.6/47.7',\n", + " 'ratio': 1.035650956602893}" ] }, - "execution_count": 48, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "refs = [filtered_right_ms_en]\n", - "sys = results_ms_en\n", "r = bleu.corpus_score(sys, refs)\n", - "r.__dict__, chrf.corpus_score(sys, refs)" + "r.__dict__" ] }, { diff --git a/session/translation/noisy-hf-t5/prepare-data.ipynb b/session/translation/noisy-hf-t5/prepare-data.ipynb new file mode 100644 index 00000000..c957ce83 --- /dev/null +++ b/session/translation/noisy-hf-t5/prepare-data.ipynb @@ -0,0 +1,604 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# !wget https://raw.githubusercontent.com/huseinzol05/malay-dataset/master/translation/iium-confession/part1.txt\n", + "# !wget https://raw.githubusercontent.com/huseinzol05/malay-dataset/master/translation/iium-confession/part2.txt\n", + "# !wget https://raw.githubusercontent.com/huseinzol05/malay-dataset/master/translation/iium-confession/part3.txt\n", + "# !wget 
https://raw.githubusercontent.com/huseinzol05/malay-dataset/master/translation/iium-confession/part4.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# !cp ~/dev/malaya/misc/normalize-twitter/*.requested .\n", + "# !cp ~/iium-confession/*.requested .\n", + "# !cp ~/facebook/*.requested2 ." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# !cp ~/dev/malaya/misc/normalize-twitter/*.ms-requested .\n", + "# !cp ~/iium-confession/*.ms-requested .\n", + "# !cp ~/facebook/*.ms-requested ." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from glob import glob\n", + "import json\n", + "import re\n", + "import random\n", + "\n", + "def cleaning(string):\n", + " string = string.replace('\\n', ' ').replace('\\r', ' ')\n", + " return re.sub(r'[ ]+', ' ', string).strip()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "train = open('train.json', 'w')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "iium_crowd = glob('part*.txt')\n", + "for f in iium_crowd:\n", + " with open(f) as fopen:\n", + " for l in fopen:\n", + " try:\n", + " l, r = l.split('<>')\n", + " l = cleaning(l)\n", + " r = cleaning(r)\n", + " if len(l) and len(r):\n", + " d = {\"translation\": {\"src\": l, \"tgt\": r, 'prefix': 'terjemah Melayu ke Inggeris: '}}\n", + " train.write(f'{json.dumps(d)}\\n')\n", + " \n", + " d = {\"translation\": {\"src\": r, \"tgt\": l, 'prefix': 'terjemah Inggeris ke pasar Melayu: '}}\n", + " train.write(f'{json.dumps(d)}\\n')\n", + " except Exception as e:\n", + " # print(e)\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "with open('train-test-split.json') as fopen:\n", + " 
twitter_train_test = json.load(fopen)\n", + " twitters_test = twitter_train_test['test']\n", + " twitters_train = twitter_train_test['train']" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "with open('test.json', 'w') as fopen_jsonl:\n", + " for i in range(len(twitters_test)):\n", + " en = twitters_test[i][0]\n", + " tgt = twitters_test[i][1]\n", + " d = {\"translation\": {\"src\": en, \"tgt\": tgt, 'prefix': 'terjemah Melayu ke Inggeris: '}}\n", + " fopen_jsonl.write(f'{json.dumps(d)}\\n')" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "for i in range(len(twitters_train)):\n", + " en = twitters_train[i][0]\n", + " tgt = twitters_train[i][1]\n", + " d = {\"translation\": {\"src\": en, \"tgt\": tgt, 'prefix': 'terjemah Melayu ke Inggeris: '}}\n", + " train.write(f'{json.dumps(d)}\\n')\n", + " \n", + " d = {\"translation\": {\"src\": tgt, \"tgt\": en, 'prefix': 'terjemah Inggeris ke pasar Melayu: '}}\n", + " train.write(f'{json.dumps(d)}\\n')" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\"translation\": {\"src\": \"Just posted a photo @ Telok Ayer Tawar, Penang, Malaysia https://t.co/rFuiLzWEZ7\", \"tgt\": \"Just posted a photo @ Telok Ayer Tawar, Pulau Pinang, Malaysia https://t.co/rFuiLzWEZ7\", \"prefix\": \"terjemah Inggeris ke pasar Melayu: \"}}\r\n", + "{\"translation\": {\"src\": \"Asal SHAH ALAM still penuh Woi\", \"tgt\": \"The origin of SHAH ALAM is still full of Woi\", \"prefix\": \"terjemah Melayu ke Inggeris: \"}}\r\n", + "{\"translation\": {\"src\": \"The origin of SHAH ALAM is still full of Woi\", \"tgt\": \"Asal SHAH ALAM still penuh Woi\", \"prefix\": \"terjemah Inggeris ke pasar Melayu: \"}}\r\n" + ] + } + ], + "source": [ + "!tail -n 3 train.json" + ] + }, + { + "cell_type": "code", + "execution_count": 11, 
+ "metadata": {}, + "outputs": [], + "source": [ + "twitters = glob('*.normalized.jsonl.requested.ms-requested')\n", + "for f in twitters:\n", + " with open(f) as fopen:\n", + " for l in fopen:\n", + " try:\n", + " data = json.loads(l)\n", + " en = data['src']['r']['result']\n", + " ori = cleaning(data['src']['src']['original'])\n", + " norm = cleaning(data['src']['src']['normalized'])\n", + " tgt = cleaning(data['r']['result'])\n", + " \n", + " d = {\"translation\": {\"src\": ori, \"tgt\": tgt, 'prefix': 'terjemah pasar Melayu ke Melayu: '}}\n", + " train.write(f'{json.dumps(d)}\\n')\n", + " \n", + " d = {\"translation\": {\"src\": norm, \"tgt\": tgt, 'prefix': 'terjemah pasar Melayu ke Melayu: '}}\n", + " train.write(f'{json.dumps(d)}\\n')\n", + " \n", + " d = {\"translation\": {\"src\": tgt, \"tgt\": ori, 'prefix': 'terjemah Melayu ke pasar Melayu: '}}\n", + " train.write(f'{json.dumps(d)}\\n')\n", + "\n", + " except Exception as e:\n", + " print(e)\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\"translation\": {\"src\": \"Jim kadar bayan daukarsa baiki ia pergi dan buro di, ya aksara datang kamfani dah halinsa ia bera. Matashi ya beca dah dan ada https://t.co/w6WyfWbFkv\", \"tgt\": \"Jim kadar bayan sakunsa baiki dia pergi na buro di, iya watak datang kompeni udah halinsa dia bera. Matashi ya beca dan ada https://t.co/w6WyfWbFkv\", \"prefix\": \"terjemah pasar Melayu ke Melayu: \"}}\r\n", + "{\"translation\": {\"src\": \"Jim kadar bayan sakunsa baiki dia pergi na buro di, iya watak datang kompeni udah halinsa dia bera. Matashi ya beca dan ada https://t.co/w6WyfWbFkv\", \"tgt\": \"Jim kadan bayan daukarsa aiki a gidan burodi, ya kassara mai kamfani da halinsa na bera. 
Matashi ya fece da kudade https://t.co/w6WyfWbFkv\", \"prefix\": \"terjemah Melayu ke pasar Melayu: \"}}\r\n", + "{\"translation\": {\"src\": \"Dia kat lane dalam tapi nk masuk kanan\", \"tgt\": \"Dia berada di lorong dalam tetapi mahu masuk ke sebelah kanan\", \"prefix\": \"terjemah pasar Melayu ke Melayu: \"}}\r\n" + ] + } + ], + "source": [ + "!tail -n 3 train.json" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "iiums = glob('clean-iium*.splitted.requested')\n", + "iiums.append('output.error.requested')\n", + "\n", + "for f in iiums:\n", + " with open(f) as fopen:\n", + " for l in fopen:\n", + " try:\n", + " data = json.loads(l)\n", + " l = cleaning(data['src'])\n", + " r = cleaning(data['r']['result'])\n", + " if len(l) and len(r):\n", + " d = {\"translation\": {\"src\": l, \"tgt\": r, 'prefix': 'terjemah Melayu ke Inggeris: '}}\n", + " train.write(f'{json.dumps(d)}\\n')\n", + " \n", + " d = {\"translation\": {\"src\": r, \"tgt\": l, 'prefix': 'terjemah Inggeris ke pasar Melayu: '}}\n", + " train.write(f'{json.dumps(d)}\\n')\n", + " except Exception as e:\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\"translation\": {\"src\": \"Jadi aku pun wasap Mak aku yang aku cintai malam tu jugak. Sambil-sambil tu layan lah makan western tu tapi.. dah x fikir dah nikmat makanan tu sbb fikir nk settle kan benda yang berserabut dalam kepala otak aku ni haa. So aku wasap mak aku betapa aku sangat x senang hidup macam ni, sebab aku memang niat datang ke kampus ini untuk belajar sebab aku berjaya dapat bidang yang minat so xkan nk sia-siakan rezeki yang Allah beri ni kan? Jadi aku pun menyusun ayat mengatakan aku ada crush pada seseorang ni, dan aku rasa aku mmg x mampu untuk go to the next level. 
So aku telah membuat suatu keputusan dimana aku yakin pada malam tu Allah adalah pemberi ilham yang terbaik untuk diri aku. Aku kata kat Mak aku \\\" Mak.. along rasa, along perlu buat keputusan ini dan along mintak maaf kalau perkara ini akan menyusahkan mak\\\" so mak aku pun balas pendek je.. \\\" ok ape dia?\\\"...\", \"tgt\": \"So I smoked my mom whom I love that night too. Meanwhile, I was eating the western food but.. I didn't think that the food was good because I was thinking about settling the things that are in my brain. So I told my mom how happy I am to live like this, because I really intend to come to this campus to study because I managed to get a field that interests me, so I don't want to waste the sustenance that God has given me, right? So I put together a sentence saying that I have a crush on this person, and I think I can't go to the next level. So I have made a decision where I believe that night God is the best inspiration for me. I said to my mom, \\\"Mom.. I feel like, I have to make this decision and I'm sorry if this will bother you\\\" so my mom gave me a short reply.. \\\"what's up with him?\\\"...\", \"prefix\": \"terjemah Melayu ke Inggeris: \"}}\r\n", + "{\"translation\": {\"src\": \"So I smoked my mom whom I love that night too. Meanwhile, I was eating the western food but.. I didn't think that the food was good because I was thinking about settling the things that are in my brain. So I told my mom how happy I am to live like this, because I really intend to come to this campus to study because I managed to get a field that interests me, so I don't want to waste the sustenance that God has given me, right? So I put together a sentence saying that I have a crush on this person, and I think I can't go to the next level. So I have made a decision where I believe that night God is the best inspiration for me. I said to my mom, \\\"Mom.. 
I feel like, I have to make this decision and I'm sorry if this will bother you\\\" so my mom gave me a short reply.. \\\"what's up with him?\\\"...\", \"tgt\": \"Jadi aku pun wasap Mak aku yang aku cintai malam tu jugak. Sambil-sambil tu layan lah makan western tu tapi.. dah x fikir dah nikmat makanan tu sbb fikir nk settle kan benda yang berserabut dalam kepala otak aku ni haa. So aku wasap mak aku betapa aku sangat x senang hidup macam ni, sebab aku memang niat datang ke kampus ini untuk belajar sebab aku berjaya dapat bidang yang minat so xkan nk sia-siakan rezeki yang Allah beri ni kan? Jadi aku pun menyusun ayat mengatakan aku ada crush pada seseorang ni, dan aku rasa aku mmg x mampu untuk go to the next level. So aku telah membuat suatu keputusan dimana aku yakin pada malam tu Allah adalah pemberi ilham yang terbaik untuk diri aku. Aku kata kat Mak aku \\\" Mak.. along rasa, along perlu buat keputusan ini dan along mintak maaf kalau perkara ini akan menyusahkan mak\\\" so mak aku pun balas pendek je.. \\\" ok ape dia?\\\"...\", \"prefix\": \"terjemah Inggeris ke pasar Melayu: \"}}\r\n", + "{\"translation\": {\"src\": \"biaselah orang lama mana rajin nk taip panjang2... so aku pun sambung.. \\\"mak, along rasa x mampu untuk hadapi benda ini dan along memang niat nk belajar sahaja disini. Jadi untuk mendamaikan hati yang berserabut ni.. along dah buat keputusan nk mintak tolong mak...\\\" x mampu beb nk lengkapkan ayat nk bagitau mak aku,sebab aku rasa keputusan yang aku buat ni dah macam bukan perkara normal (mungkin la). So mak aku pun balas lah.. \\\" ape dia? \\\" sambung...\\\" along nak mintak tolong mak untuk carikan jodoh along. Boleh x mak?\\\"... mak aku bluetick je bhai..xtau lah mak aku bincang dgn bapak aku ke..mak aku fikir dulu ke ape... so sambil tungu mak aku reply aku makan lah dulu makanan depan mata tu. Selang beberapa minit mak aku pun reply. \\\" boleh inshaAllah..\\\" ... 
jadi aku pun cakap lah terima kasih sangat2 mak kerana mak sanggup memikul satu perkara yang aku consider ia satu perkara yang besar dan membebankan dalam hidup..\", \"tgt\": \"It's normal for old people to type long... so I continued.. \\\"Mom, I feel like I can't deal with this and I really intend to just study here. So to reconcile this troubled heart.. I've made a decision I want to ask mom for help...\\\" I can't complete the sentence to tell my mom, because I feel like the decision I made is not normal (maybe it is). So my mother replied.. \\\"what is she?\\\" I continued...\\\" I want to ask my mother to help me find a match. Can I?\\\"... my mother just bluetick, bhai.. I want to talk to my father. I'm... mom, I'm thinking about what to do first... so while waiting, mom replied, I'll eat the food in front of my eyes first. After a few minutes, my mother replied. \\\" yes inshaAllah..\\\" ... so I said thank you very much mom for being willing to shoulder something that I consider to be a big and burdensome thing in life..\", \"prefix\": \"terjemah Melayu ke Inggeris: \"}}\r\n" + ] + } + ], + "source": [ + "!tail -n 3 train.json" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "iiums = glob('clean-iium*.splitted.requested.ms-requested')\n", + "iiums.append('output.error.requested.ms-requested')\n", + "\n", + "for f in iiums:\n", + " with open(f) as fopen:\n", + " for l in fopen:\n", + " try:\n", + " data = json.loads(l)\n", + " l = cleaning(data['src']['src'])\n", + " r = cleaning(data['r']['result'])\n", + " \n", + " if len(l) and len(r):\n", + " d = {\"translation\": {\"src\": l, \"tgt\": r, 'prefix': 'terjemah pasar Melayu ke Melayu: '}}\n", + " train.write(f'{json.dumps(d)}\\n')\n", + " \n", + " d = {\"translation\": {\"src\": r, \"tgt\": l, 'prefix': 'terjemah Melayu ke pasar Melayu: '}}\n", + " train.write(f'{json.dumps(d)}\\n')\n", + " except Exception as e:\n", + " pass" + ] + }, + { + 
"cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\"translation\": {\"src\": \"Jadi aku pun wasap Mak aku yang aku cintai malam tu jugak. Sambil-sambil tu layan lah makan western tu tapi.. dah x fikir dah nikmat makanan tu sbb fikir nk settle kan benda yang berserabut dalam kepala otak aku ni haa. So aku wasap mak aku betapa aku sangat x senang hidup macam ni, sebab aku memang niat datang ke kampus ini untuk belajar sebab aku berjaya dapat bidang yang minat so xkan nk sia-siakan rezeki yang Allah beri ni kan? Jadi aku pun menyusun ayat mengatakan aku ada crush pada seseorang ni, dan aku rasa aku mmg x mampu untuk go to the next level. So aku telah membuat suatu keputusan dimana aku yakin pada malam tu Allah adalah pemberi ilham yang terbaik untuk diri aku. Aku kata kat Mak aku \\\" Mak.. along rasa, along perlu buat keputusan ini dan along mintak maaf kalau perkara ini akan menyusahkan mak\\\" so mak aku pun balas pendek je.. \\\" ok ape dia?\\\"...\", \"tgt\": \"Jadi saya merokok ibu saya yang saya sayangi malam itu juga. Sambil-sambil tu tengah makan western food tapi.. tak sangka makanan tu sedap sebab tengah fikir nak settlekan benda yang ada dalam otak ni. Jadi saya beritahu mak saya betapa gembiranya saya hidup begini, kerana saya memang berniat untuk datang ke kampus ini untuk belajar kerana saya berjaya mendapat bidang yang saya minati, jadi saya tidak mahu mensia-siakan rezeki yang Allah berikan. saya, kan? Jadi saya menyusun ayat yang mengatakan bahawa saya menyukai orang ini, dan saya rasa saya tidak boleh pergi ke peringkat seterusnya. Jadi saya telah membuat keputusan di mana saya percaya malam itu Tuhan adalah inspirasi terbaik untuk saya. Saya berkata kepada ibu saya, \\\"Ibu.. saya rasa, saya perlu membuat keputusan ini dan saya minta maaf jika ini akan mengganggu ibu saya\\\" jadi ibu saya memberi saya jawapan ringkas.. \\\"apa masalah dengan dia?\\\".. 
.\", \"prefix\": \"terjemah pasar Melayu ke Melayu: \"}}\r\n", + "{\"translation\": {\"src\": \"Jadi saya merokok ibu saya yang saya sayangi malam itu juga. Sambil-sambil tu tengah makan western food tapi.. tak sangka makanan tu sedap sebab tengah fikir nak settlekan benda yang ada dalam otak ni. Jadi saya beritahu mak saya betapa gembiranya saya hidup begini, kerana saya memang berniat untuk datang ke kampus ini untuk belajar kerana saya berjaya mendapat bidang yang saya minati, jadi saya tidak mahu mensia-siakan rezeki yang Allah berikan. saya, kan? Jadi saya menyusun ayat yang mengatakan bahawa saya menyukai orang ini, dan saya rasa saya tidak boleh pergi ke peringkat seterusnya. Jadi saya telah membuat keputusan di mana saya percaya malam itu Tuhan adalah inspirasi terbaik untuk saya. Saya berkata kepada ibu saya, \\\"Ibu.. saya rasa, saya perlu membuat keputusan ini dan saya minta maaf jika ini akan mengganggu ibu saya\\\" jadi ibu saya memberi saya jawapan ringkas.. \\\"apa masalah dengan dia?\\\".. .\", \"tgt\": \"Jadi aku pun wasap Mak aku yang aku cintai malam tu jugak. Sambil-sambil tu layan lah makan western tu tapi.. dah x fikir dah nikmat makanan tu sbb fikir nk settle kan benda yang berserabut dalam kepala otak aku ni haa. So aku wasap mak aku betapa aku sangat x senang hidup macam ni, sebab aku memang niat datang ke kampus ini untuk belajar sebab aku berjaya dapat bidang yang minat so xkan nk sia-siakan rezeki yang Allah beri ni kan? Jadi aku pun menyusun ayat mengatakan aku ada crush pada seseorang ni, dan aku rasa aku mmg x mampu untuk go to the next level. So aku telah membuat suatu keputusan dimana aku yakin pada malam tu Allah adalah pemberi ilham yang terbaik untuk diri aku. Aku kata kat Mak aku \\\" Mak.. along rasa, along perlu buat keputusan ini dan along mintak maaf kalau perkara ini akan menyusahkan mak\\\" so mak aku pun balas pendek je.. 
\\\" ok ape dia?\\\"...\", \"prefix\": \"terjemah Melayu ke pasar Melayu: \"}}\r\n" + ] + } + ], + "source": [ + "!tail -n 2 train.json" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['facebook-data.jsonl00.splitted.requested2',\n", + " 'facebook-data-v2.jsonl01.splitted.requested2',\n", + " 'filtered-common-crawl02.splitted.requested2',\n", + " 'facebook-data.jsonl02.splitted.requested2',\n", + " 'facebook-data.jsonl03.splitted.requested2',\n", + " 'facebook-data-v2.jsonl00.splitted.requested2',\n", + " 'facebook-data-v2.jsonl02.splitted.requested2',\n", + " 'facebook-data.jsonl01.splitted.requested2',\n", + " 'facebook-data-v2.jsonl03.splitted.requested2']" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "facebooks = glob('*.splitted.requested2')\n", + "facebooks" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "for f in facebooks:\n", + " with open(f) as fopen:\n", + " for l in fopen:\n", + " try:\n", + " data = json.loads(l)\n", + " l = cleaning(data['src']['text'])\n", + " r = cleaning(data['r']['result'])\n", + " l_ = cleaning(data['src']['original'])\n", + " if len(l) and len(r):\n", + " d = {\"translation\": {\"src\": l, \"tgt\": r, 'prefix': 'terjemah Melayu ke Inggeris: '}}\n", + " train.write(f'{json.dumps(d)}\\n')\n", + " \n", + " d = {\"translation\": {\"src\": r, \"tgt\": l, 'prefix': 'terjemah Inggeris ke pasar Melayu: '}}\n", + " train.write(f'{json.dumps(d)}\\n')\n", + " \n", + " if len(l_) and len(r):\n", + " d = {\"translation\": {\"src\": l_, \"tgt\": r, 'prefix': 'terjemah Melayu ke Inggeris: '}}\n", + " train.write(f'{json.dumps(d)}\\n')\n", + " \n", + " d = {\"translation\": {\"src\": r, \"tgt\": l_, 'prefix': 'terjemah Inggeris ke pasar Melayu: '}}\n", + " train.write(f'{json.dumps(d)}\\n')\n", + " except 
Exception as e:\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\"translation\": {\"src\": \"GPS NATIONAL LINE\", \"tgt\": \"BN GPS\", \"prefix\": \"terjemah Inggeris ke pasar Melayu: \"}}\r\n", + "{\"translation\": {\"src\": \"Sy pun geng 2hb. lejar masih 2020\", \"tgt\": \"I'm also in the 2nd gang. ledger is still 2020\", \"prefix\": \"terjemah Melayu ke Inggeris: \"}}\r\n" + ] + } + ], + "source": [ + "!tail -n 2 train.json" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['facebook-data-v2.jsonl01.splitted.requested2.ms-requested',\n", + " 'facebook-data-v2.jsonl01.splitted.requested.ms-requested',\n", + " 'facebook-data-v2.jsonl02.splitted.requested2.ms-requested',\n", + " 'facebook-data.jsonl02.splitted.requested2.ms-requested',\n", + " 'facebook-data-v2.jsonl00.splitted.requested2.ms-requested',\n", + " 'facebook-data.jsonl03.splitted.requested2.ms-requested',\n", + " 'facebook-data-v2.jsonl03.splitted.requested2.ms-requested',\n", + " 'facebook-data.jsonl01.splitted.requested2.ms-requested',\n", + " 'facebook-data.jsonl00.splitted.requested2.ms-requested']" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "facebooks = glob('*facebook*.ms-requested')\n", + "facebooks" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "for f in facebooks:\n", + " with open(f) as fopen:\n", + " for l in fopen:\n", + " try:\n", + " data = json.loads(l)\n", + " l = cleaning(data['src']['src']['original'])\n", + " l_ = cleaning(data['src']['src']['text'])\n", + " r = cleaning(data['r']['result'])\n", + "\n", + " if len(l) and len(r):\n", + " d = {\"translation\": {\"src\": l, \"tgt\": r, 'prefix': 'terjemah pasar Melayu ke Melayu: '}}\n", + " 
train.write(f'{json.dumps(d)}\\n')\n", + " \n", + " d = {\"translation\": {\"src\": r, \"tgt\": l, 'prefix': 'terjemah Melayu ke pasar Melayu: '}}\n", + " train.write(f'{json.dumps(d)}\\n')\n", + " \n", + " if len(l_) and len(r):\n", + " d = {\"translation\": {\"src\": l_, \"tgt\": r, 'prefix': 'terjemah pasar Melayu ke Melayu: '}}\n", + " train.write(f'{json.dumps(d)}\\n')\n", + " except Exception as e:\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\"translation\": {\"src\": \"Nasib baik 22 bulan Kalau 5 tahun dah jadi melayu Singapore\", \"tgt\": \"Nasib baik 22 bulan Kalau 5 tahun dah jadi Melayu Singapura\", \"prefix\": \"terjemah pasar Melayu ke Melayu: \"}}\r\n", + "{\"translation\": {\"src\": \"Saya x minta byk DSAi ..setiap jawatan adalah amanah .apa peekara yang boleh membawa kebaikkan ..teruskan DSAi ..lpakn mulut orang..bila kita buat kerana Allah SWT, Allah SWT akan tolong hambanya. Teruskan membuat kebaikkan .\", \"tgt\": \"Saya tak minta banyak DSAI. . setiap jawatan adalah amanah. apa yang boleh mendatangkan kebaikan. . teruskan DSAI. . apa salahnya mulut orang kalau kita buat kerana Allah SWT, Allah SWT akan bantu hambanya. teruskan berbuat baik.\", \"prefix\": \"terjemah pasar Melayu ke Melayu: \"}}\r\n", + "{\"translation\": {\"src\": \"Saya tak minta banyak DSAI. . setiap jawatan adalah amanah. apa yang boleh mendatangkan kebaikan. . teruskan DSAI. . apa salahnya mulut orang kalau kita buat kerana Allah SWT, Allah SWT akan bantu hambanya. teruskan berbuat baik.\", \"tgt\": \"Saya x minta byk DSAi ..setiap jawatan adalah amanah .apa peekara yang boleh membawa kebaikkan ..teruskan DSAi ..lpakn mulut orang..bila kita buat kerana Allah SWT, Allah SWT akan tolong hambanya. 
Teruskan membuat kebaikkan .\", \"prefix\": \"terjemah Melayu ke pasar Melayu: \"}}\r\n" + ] + } + ], + "source": [ + "!tail -n 3 train.json" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "common_crawls = glob('filtered-common-crawl*')\n", + "\n", + "for f in common_crawls:\n", + " with open(f) as fopen:\n", + " for l in fopen:\n", + " try:\n", + " data = json.loads(l)\n", + " l = cleaning(data['src'])\n", + " r = cleaning(data['r']['result'])\n", + " if len(l) and len(r):\n", + " d = {\"translation\": {\"src\": l, \"tgt\": r, 'prefix': 'terjemah Melayu ke Inggeris: '}}\n", + " train.write(f'{json.dumps(d)}\\n')\n", + " \n", + " d = {\"translation\": {\"src\": r, \"tgt\": l, 'prefix': 'terjemah Inggeris ke pasar Melayu: '}}\n", + " train.write(f'{json.dumps(d)}\\n')\n", + " except Exception as e:\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "train.close()" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "8635532it [00:36, 235369.72it/s]\n" + ] + } + ], + "source": [ + "from tqdm import tqdm\n", + "\n", + "with open('filtered-train.json', 'w') as train:\n", + " with open('train.json') as fopen:\n", + " for l in tqdm(fopen):\n", + " data = json.loads(l)\n", + " src = data['translation']['src']\n", + " tgt = data['translation']['tgt']\n", + " if ('promo' in src and 'firm' in tgt) or ('firm' in src and 'promo' in tgt):\n", + " continue\n", + " train.write(f'{json.dumps(data)}\\n')" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "8634891 filtered-train.json\r\n" + ] + } + ], + "source": [ + "!wc -l filtered-train.json" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + 
"outputs": [], + "source": [ + "!shuf filtered-train.json > shuffled-train.json" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "!shuf test.json > shuffled-test.json" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "8634891it [00:03, 2495153.17it/s]\n" + ] + }, + { + "data": { + "text/plain": [ + "2112709" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = []\n", + "\n", + "with open('filtered-train.json') as fopen:\n", + " for l in tqdm(fopen):\n", + " if 'terjemah pasar Melayu ke Melayu' in l:\n", + " data.append(l)\n", + " \n", + "len(data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}