{ "cells": [ { "source": [ "## Poorly cleaned and documented notebook for evaluating text generation with BLEU and BERTScore" ], "cell_type": "markdown", "metadata": {} }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset, concatenate_datasets\n", "import transformers\n", "from transformers import (\n", " Trainer,\n", " TrainingArguments,\n", " default_data_collator,\n", " AutoModelForCausalLM,\n", " AutoModelForSequenceClassification,\n", " PreTrainedTokenizerFast,\n", " AutoModelWithLMHead,\n", " AutoConfig,\n", " AutoModel,\n", " AutoTokenizer,\n", " GPT2TokenizerFast,\n", " GPT2Config\n", ")\n", "import datasets\n", "import torch\n", "import numpy as np\n", "import os\n", "from tokenizers.processors import TemplateProcessing\n", "\n", "from evaluate import load\n", "\n", "\"\"\"\n", "model_type = \"custom\"\n", "if model_type == \"large\":\n", " tokenizer = AutoTokenizer.from_pretrained(\"H:\\\\Data_temp\\\\checkpoints\\\\large\\\\checkpoint-12200\")\n", " model = AutoModelForCausalLM.from_pretrained(\"H:\\\\Data_temp\\\\checkpoints\\\\large\\\\checkpoint-12200\").to(\"cuda\")\n", "elif model_type == \"small\":\n", " tokenizer = AutoTokenizer.from_pretrained(r\"H:\\Data_temp\\checkpoints\\small\\checkpoint-140000\")\n", " model = AutoModelForCausalLM.from_pretrained(r\"H:\\Data_temp\\checkpoints\\small\\checkpoint-140000\").to(\"cuda\")\n", "elif model_type == \"custom\":\n", " tokenizer = GPT2TokenizerFast.from_pretrained('Finnish-NLP/gpt2-large-finnish')\n", " model = AutoModelForCausalLM.from_pretrained('Finnish-NLP/gpt2-large-finnish').to(\"cuda\")\n", "elif model_type == \"distill\":\n", " config = GPT2Config.from_pretrained(r\"H:\\Data_temp\\checkpoints\\distillation\\third\\config.json\")\n", " tokenizer = AutoTokenizer.from_pretrained(r\"H:\\Data_temp\\checkpoints\\large\\checkpoint-12200\")\n", " model = AutoModelForCausalLM.from_pretrained(r\"H:\\Data_temp\\checkpoints\\distillation\\third\\model_step_640000.pth\",config=config).to(\"cuda\")\n", "\"\"\"\n", "bleu = load(\"bleu\")\n", "bertscore = load(\"bertscore\")" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using custom data configuration Finnish-NLP--mc4_fi_cleaned-c578f3a358717f8c\n", "Reusing dataset csv (H:\\Data_temp\\cache\\Finnish-NLP___csv\\Finnish-NLP--mc4_fi_cleaned-c578f3a358717f8c\\0.0.0\\652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)\n" ] }, { "data": { "text/plain": [ "Dataset({\n", " features: ['text', 'perplexity_kenlm_full'],\n", " num_rows: 17877\n", "})" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#input_dir = \"H:\\\\Data_temp\\\\1024_bpe_dataset2\"\n", "#dataset = datasets.load_from_disk(input_dir)[\"test\"]\n", "#dataset\n", "dataset = datasets.load_dataset(\"Finnish-NLP/mc4_fi_cleaned\", split=\"validation\").remove_columns([\"timestamp\",\"url\"])\n", "dataset = dataset.filter(lambda e: len(e[\"text\"]) > 200, num_proc=12, load_from_cache_file=False, writer_batch_size=100000)\n", "dataset" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'precision': [1.0, 0.9999998807907104],\n", " 'recall': [1.0, 0.9999998807907104],\n", " 'f1': [1.0, 0.9999998807907104],\n", " 'hashcode': 'bert-base-multilingual-cased_L9_no-idf_version=0.3.11(hug_trans=4.15.0)'}" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\n", "predictions = [\"hello there\", \"general kenobi\"]\n", "references = [\"hello there\", \"general kenobi\"]\n", "results = bertscore.compute(predictions=predictions, references=references, lang=\"fi\")\n", "results" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 200/200 [06:53<00:00, 2.07s/it]\n" ] }, { "data": { "text/plain": [ "(0.6634505313245496,\n", " 0.6440643028697776,\n", " 0.6530068675477301,\n", " {'bleu': 0.055396740403126914,\n", " 'precisions': [0.1908035441490987,\n", " 0.040655531043176804,\n", " 0.022608978529603124],\n", " 'brevity_penalty': 0.9896657459467438,\n", " 'length_ratio': 0.9897187783489567,\n", " 'translation_length': 6546,\n", " 'reference_length': 6614})" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(r\"H:\\Data_temp\\checkpoints\\large\\checkpoint-31500\")\n", "model = AutoModelForCausalLM.from_pretrained(r\"H:\\Data_temp\\checkpoints\\large\\checkpoint-31500\").to(\"cuda\")\n", "refs = []\n", "predictions = []\n", "from tqdm import tqdm\n", "for i in tqdm(range(0,200)):\n", " input_text = dataset[i][\"text\"] \n", " ids = tokenizer.encode(input_text, add_special_tokens=False, return_tensors=\"pt\").to(\"cuda\")\n", " generation_len = 50\n", " if ids.shape[1] < generation_len*2:\n", " first = ids[:,:(ids.shape[1]//2)]\n", " second = ids[:,(ids.shape[1]//2):]\n", " else:\n", " cutoff = min(300,ids.shape[1]-generation_len)\n", " first = ids[:,:cutoff]\n", " second = ids[:,cutoff:cutoff+generation_len]\n", " #print(tokenizer.decode(,skip_special_tokens=True, clean_up_tokenization_spaces=True))\n", " text_in = tokenizer.decode(first[0],skip_special_tokens=True, clean_up_tokenization_spaces=True)\n", " ref = tokenizer.decode(second[0],skip_special_tokens=True, clean_up_tokenization_spaces=True)\n", " prompt_len = len(text_in)\n", " model_out = model.generate(first, max_length=first.shape[1]+second.shape[1], do_sample=True, top_p=0.9, top_k=20, temperature=0.9, pad_token_id=tokenizer.eos_token_id)\n", "\n", "\n", " text_out = tokenizer.decode(model_out[0],skip_special_tokens=True, clean_up_tokenization_spaces=True)[prompt_len:]\n", " refs.append([ref])\n", " predictions.append(text_out)\n", " \n", "#len(tokenizer.decode(first,skip_special_tokens=True, clean_up_tokenization_spaces=True))\n", "len(refs[:-1]), len(predictions[:-1])\n", "bs = bertscore.compute(predictions=predictions[:-1], references=refs[:-1], lang=\"fi\")\n", "np.mean(bs[\"precision\"]), np.mean(bs[\"recall\"]),np.mean(bs[\"f1\"]),bleu.compute(predictions=predictions, max_order = 3, references=refs)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 5000/5000 [2:07:15<00:00, 1.53s/it] \n" ] }, { "data": { "text/plain": [ "(0.6666507967282925,\n", " 0.6589269880879806,\n", " 0.6626176441889043,\n", " {'bleu': 0.0670418098328639,\n", " 'precisions': [0.20027962065152674,\n", " 0.04913386342784744,\n", " 0.030621010257533138],\n", " 'brevity_penalty': 1.0,\n", " 'length_ratio': 1.0874531748601026,\n", " 'translation_length': 188112,\n", " 'reference_length': 172984})" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\n", "tokenizer = GPT2TokenizerFast.from_pretrained('Finnish-NLP/gpt2-large-finnish')\n", "model = AutoModelForCausalLM.from_pretrained('Finnish-NLP/gpt2-large-finnish').to(\"cuda\")\n", "refs = []\n", "predictions = []\n", "from tqdm import tqdm\n", "for i in tqdm(range(0,5000)):\n", " input_text = dataset[i][\"text\"] \n", " ids = tokenizer.encode(input_text, add_special_tokens=False, return_tensors=\"pt\").to(\"cuda\")\n", " generation_len = 50\n", " if ids.shape[1] < generation_len*2:\n", " first = ids[:,:(ids.shape[1]//2)]\n", " second = ids[:,(ids.shape[1]//2):]\n", " else:\n", " cutoff = min(300,ids.shape[1]-generation_len)\n", " first = ids[:,:cutoff]\n", " second = ids[:,cutoff:cutoff+generation_len]\n", " #print(tokenizer.decode(,skip_special_tokens=True, clean_up_tokenization_spaces=True))\n", " text_in = tokenizer.decode(first[0],skip_special_tokens=True, clean_up_tokenization_spaces=True)\n", " ref = tokenizer.decode(second[0],skip_special_tokens=True, clean_up_tokenization_spaces=True)\n", " prompt_len = len(text_in)\n", " model_out = model.generate(first, max_length=first.shape[1]+second.shape[1], do_sample=True, top_p=0.9, top_k=20, temperature=0.9, pad_token_id=tokenizer.eos_token_id)\n", "\n", "\n", " text_out = tokenizer.decode(model_out[0],skip_special_tokens=True, clean_up_tokenization_spaces=True)[prompt_len:]\n", " refs.append([ref])\n", " predictions.append(text_out)\n", " \n", "#len(tokenizer.decode(first,skip_special_tokens=True, clean_up_tokenization_spaces=True))\n", "len(refs[:-1]), len(predictions[:-1])\n", "bs = bertscore.compute(predictions=predictions[:-1], references=refs[:-1], lang=\"fi\")\n", "np.mean(bs[\"precision\"]), np.mean(bs[\"recall\"]),np.mean(bs[\"f1\"]),bleu.compute(predictions=predictions, max_order = 3, references=refs)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ " 0%| | 0/5000 [00:00\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mdataset\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mselect\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mrange\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m10\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"text\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[1;31mAttributeError\u001b[0m: 'list' object has no attribute 'shape'" ] } ], "source": [ "dataset.select(range(10))[\"text\"].shape" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "10" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(dataset.select(range(10))[\"text\"])" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "torch.Size([1, 73096])" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\n", "token_list = []\n", "for i in range(100):\n", " input_item = dataset[i]\n", " tensor = tokenizer.encode(input_item[\"text\"], add_special_tokens=True, return_tensors=\"pt\").to(\"cuda\") \n", " token_list.append(tensor)\n", "encodings = torch.cat(token_list,1)\n", "encodings.shape" ] }, { "cell_type": "code", "execution_count": 115, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(tensor(4.8928, device='cuda:0'),\n", " array([[11344, 359, 863, ..., 8721, 313, 12]], dtype=int64))" ] }, "execution_count": 115, "metadata": {}, "output_type": "execute_result" } ], "source": [ "begin_loc = 100\n", "seq_len = encodings.size(1)\n", "end_loc = min(begin_loc + max_length, seq_len)\n", "prev_end_loc = 0\n", "trg_len = end_loc - prev_end_loc # may be different from stride on last loop\n", "input_ids = encodings[:, begin_loc:end_loc]\n", "target_ids = input_ids.clone()\n", "target_ids[:, :-trg_len] = -100\n", "with torch.no_grad():\n", " outputs = model(input_ids, labels=target_ids)\n", "outputs.loss, (encodings[:, begin_loc:end_loc].cpu().numpy())" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ " 97%|█████████▋| 1421/1462 [08:08<00:14, 2.91it/s, loss=6.57]\n" ] }, { "data": { "text/plain": [ "(6.566047787444524, 710.5560179623819)" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from tqdm import tqdm\n", "\n", "max_length = model.config.n_positions\n", "stride = 50\n", "\n", "#input_text = dataset[i][\"text\"] \n", "#encodings = tokenizer.encode(input_text, add_special_tokens=False, return_tensors=\"pt\").to(\"cuda\")\n", "#device = \"cuda\"\n", "seq_len = encodings.size(1)\n", "\n", "prev_end_loc = 0\n", "count_sum = 0\n", "nlls = []\n", "pbar = tqdm(range(0, seq_len, stride))\n", "for begin_loc in pbar:\n", " end_loc = min(begin_loc + max_length, seq_len)\n", " trg_len = end_loc - prev_end_loc # may be different from stride on last loop\n", " input_ids = encodings[:, begin_loc:end_loc]\n", " target_ids = input_ids.clone()\n", " target_ids[:, :-trg_len] = -100\n", "\n", " with torch.no_grad():\n", " outputs = model(input_ids, labels=target_ids)\n", "\n", " # loss is calculated using CrossEntropyLoss which averages over input tokens.\n", " # Multiply it with trg_len to get the summation instead of average.\n", " # We will take average over all the tokens to get the true average\n", " # in the last step of this example.\n", " neg_log_likelihood = outputs.loss * trg_len\n", "\n", " nlls.append(neg_log_likelihood.cpu().numpy())\n", "\n", " prev_end_loc = end_loc\n", " pbar.set_postfix({\"loss\":(np.array(nlls[1:])/end_loc).sum()})\n", " if end_loc == seq_len:\n", " count_sum += end_loc\n", " break\n", "\n", "\n", "#ppl = torch.exp((np.array(nlls)/end_loc).sum())\n", "loss = (np.array(nlls[1:])/end_loc).sum()\n", "(loss, np.exp(loss))" ] }, { "cell_type": "code", "execution_count": 123, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "7.0311055" ] }, "execution_count": 123, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(np.array(nlls[1:])/50).mean()" ] }, { "cell_type": "code", "execution_count": 119, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(5.917751879220825, 371.5754277521345)" ] }, "execution_count": 119, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(loss, np.exp(loss))" ] }, { "cell_type": "code", "execution_count": 97, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(3.0430849, 20.969833)" ] }, "execution_count": 97, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(np.array(nlls[1:])/end_loc).sum(), np.exp((np.array(nlls[1:])/end_loc).sum())" ] }, { "cell_type": "code", "execution_count": 85, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "349.2629" ] }, "execution_count": 85, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.exp((np.array(nlls)/end_loc).sum())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "ename": "AttributeError", "evalue": "'list' object has no attribute 'cpu'", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mtorch\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexp\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtorch\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstack\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnlls\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msum\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m/\u001b[0m \u001b[0mend_loc\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mtorch\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexp\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtorch\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstack\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0marray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnlls\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcpu\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m/\u001b[0m\u001b[0mend_loc\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msum\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[1;31mAttributeError\u001b[0m: 'list' object has no attribute 'cpu'" ] } ], "source": [ "torch.exp(torch.stack(nlls).sum() / end_loc),torch.exp(torch.stack(np.array(nlls)/end_loc).sum())" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "270" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "count_sum" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor(30.6271, device='cuda:0')" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ppl = torch.exp(torch.stack(nlls).sum() / count_sum)\n", "ppl" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "torch.Size([1, 125, 50000])" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "outputs.keys()\n", "outputs.logits.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "inputs = tokenizer.encode(prompt, add_special_tokens=False, return_tensors=\"pt\")\n", "prompt_len = len(tokenizer.decode(inputs[0],skip_special_tokens=True, clean_up_tokenization_spaces=True))\n", "outputs = model.generate(inputs, max_length=len(inputs[0])+50, do_sample=True, top_p=0.98, top_k=50)\n", "tokenizer.decode(outputs[0])[prompt_len:].replace(\" ##\",\"\").replace(\"<|endoftext|>\",\"\")" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "512" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(ids)//2" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ids = dataset[\"test\"].select(range(100))[\"input_ids\"]" ] } ], "metadata": { "interpreter": { "hash": "1d9050d93d93b71fa3edc5938291757e7480975ed666173bb85be41dbf084556" }, "kernelspec": { "display_name": "Python 3.9.4 64-bit", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.4" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }