~/Projects/WhisperSpeech
git clone https://code.lsong.org/WhisperSpeech
Commit
- Commit
- 974428e2eb0219debe908e75d308bc94b85a5905
- Author
- Jakub Piotr Cłapa <[email protected]>
- Date
- 2023-04-03 11:14:31 +0000 +0000
- Diffstat
nbs/3B. Semantic to acoustic token modeling (enc-sum).ipynb | 208 ++++++
Try a few temperatures when sampling from the model
diff --git a/nbs/3B. Semantic to acoustic token modeling (enc-sum).ipynb b/nbs/3B. Semantic to acoustic token modeling (enc-sum).ipynb index 4017256de0edc4eab63d8044e21fdbe4d9ca6706..f1a24fddfd4519762d36e51053eb83a1c0a9aa1c 100644 --- a/nbs/3B. Semantic to acoustic token modeling (enc-sum).ipynb +++ b/nbs/3B. Semantic to acoustic token modeling (enc-sum).ipynb @@ -1875,6 +1875,16 @@ " table_row_every_iters=40000, run_valid_every_iters=8000)" ] }, { + "cell_type": "code", + "execution_count": null, + "id": "e7036705", + "metadata": {}, + "outputs": [], + "source": [ + "torch.save(model.state_dict(), 'saar-1000h-encsum-20e-5e-4-ce2.655.pth')" + ] + }, + { "cell_type": "markdown", "id": "843d6703", "metadata": {}, @@ -1891,7 +1901,7 @@ "outputs": [], "source": [ "model = SAARTransformer(depth=2).cuda()\n", "source": [ - "import numpy as np\n", + " <th>afile</th>\n", "model.eval().cuda();" ] }, @@ -1920,8 +1930,7 @@ " with torch.no_grad():\n", " audio = Amodel.decode([(Atoks.reshape(-1,2).T.unsqueeze(0), torch.tensor(1).cuda())])[0]\n", " torchaudio.save(name, audio.cpu(), 24000)\n", "source": [ - { "source": [ ] }, { @@ -1975,7 +1983,7 @@ { "data": { "text/plain": [ "source": [ - "id": "e361412e", + " <th>atoks</th>\n", ] }, "execution_count": null, @@ -2023,7 +2031,7 @@ { "data": { "text/html": [ "source": [ - "outputs": [ + " <th>stoks</th>\n", ], "text/plain": [ "<IPython.core.display.HTML object>" @@ -2036,8 +2044,8 @@ ], "source": [ "# the ground truth compressed speech (the best we can hope for)\n", "source": [ - "execution_count": null, "source": [ + { ] }, { @@ -2080,13 +2088,54 @@ "\n", " <div>\n", " <progress value='4500' class='' max='4500' style='width:300px; height:20px; vertical-align: middle;'></progress>\n", "source": [ + " <tr>\n", + " </div>\n", + " " + ], + "text/plain": [ + "<IPython.core.display.HTML object>" ] + }, + "metadata": {}, + " speakers.append(name.parents[1].name)\n", "metadata": {}, + " speakers.append(name.parents[1].name)\n", "outputs": [], "execution_count": null, + "outputs": [], + "execution_count": null, "source": [ + "text/html": [ + "<a href=\"test-gen-T0.6.wav\" target=\"_blank\">Listen to sample test-gen-T0.6.wav</a>" +{ "cells": [ + "text/plain": [ + "<IPython.core.display.HTML object>" + "id": "0a853249", "cells": [ + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "<style>\n", + " /* Turns off some styling */\n", + " progress {\n", + " /* gets rid of default border in Firefox and Opera. */\n", + " border: none;\n", + " /* Needs to be in here for Safari polyfill so background images work as expected. */\n", + " background-size: auto;\n", + " }\n", + " progress:not([value]), progress:not([value])::-webkit-progress-bar {\n", + " background: repeating-linear-gradient(45deg, #7e7e7e, #7e7e7e 10px, #5c5c5c 10px, #5c5c5c 20px);\n", + " }\n", + " .progress-bar-interrupted, .progress-bar-interrupted::-webkit-progress-bar {\n", + " background: #F44336;\n", + " }\n", + "</style>\n" ], "text/plain": [ "<IPython.core.display.HTML object>" @@ -2098,8 +2146,26 @@ }, { "data": { "text/html": [ + "\n", + " <div>\n", " </thead>\n", +{ + " 100.00% [4500/4500 00:35<00:00]\n", + " </div>\n", + " " + ], + "text/plain": [ + "<IPython.core.display.HTML object>" + ] + "id": "0a853249", { + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "<a href=\"test-gen-T0.7.wav\" target=\"_blank\">Listen to sample test-gen-T0.7.wav</a>" ], "text/plain": [ "<IPython.core.display.HTML object>" @@ -2107,37 +2173,159 @@ ] }, "metadata": {}, "output_type": "display_data" + "cell_type": "code", "id": "0a853249", + "outputs": [], + { + "data": { "metadata": {}, + { + "\n", + "cell_type": "code", "id": "0a853249", + "source": [ + " /* Turns off some styling */\n", + " progress {\n", + " /* gets rid of default border in Firefox and Opera. */\n", + " border: none;\n", + " /* Needs to be in here for Safari polyfill so background images work as expected. */\n", + " background-size: auto;\n", + "metadata": {}, "outputs": [], + " progress:not([value]), progress:not([value])::-webkit-progress-bar {\n", + " background: repeating-linear-gradient(45deg, #7e7e7e, #7e7e7e 10px, #5c5c5c 10px, #5c5c5c 20px);\n", + " }\n", + " .progress-bar-interrupted, .progress-bar-interrupted::-webkit-progress-bar {\n", + " grps.append(grp)" "source": [ + " }\n", + "</style>\n" + ], + "text/plain": [ + "<IPython.core.display.HTML object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + " <div>\n", " </thead>\n", +{ + " <td>1</td>\n", "cell_type": "code", + " </div>\n", "source": [ + "id": "13462aa4", + ], + "text/plain": [ + "<IPython.core.display.HTML object>" + ] + }, "metadata": {}, + "output_type": "display_data" + }, + { + "execution_count": null, "source": [ + "text/html": [ + "<a href=\"test-gen-T0.8.wav\" target=\"_blank\">Listen to sample test-gen-T0.8.wav</a>" + ], + "text/plain": [ + "cell_type": "code", "output_type": "execute_result" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "execution_count": null, "source": [ + "text/html": [ + "\n", + "cell_type": "code", "id": "0a853249", + "source": [ + "cell_type": "code", "metadata": {}, + " progress {\n", + " /* gets rid of default border in Firefox and Opera. */\n", + " border: none;\n", + " /* Needs to be in here for Safari polyfill so background images work as expected. */\n", + " background-size: auto;\n", + " }\n", + " progress:not([value]), progress:not([value])::-webkit-progress-bar {\n", + " background: repeating-linear-gradient(45deg, #7e7e7e, #7e7e7e 10px, #5c5c5c 10px, #5c5c5c 20px);\n", + " }\n", + " .progress-bar-interrupted, .progress-bar-interrupted::-webkit-progress-bar {\n", + " grps.append(grp)" "source": [ + " }\n", + "</style>\n" + ], + "text/plain": [ + "<IPython.core.display.HTML object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + "cell_type": "code", ], + { + "data": { + "text/html": [ + "\n", + " <div>\n", " </thead>\n", +{ "source": [ + " </thead>\n", + " </div>\n", "source": [ + "id": "13462aa4", + ], + "text/plain": [ + "<IPython.core.display.HTML object>" + ] + }, + "metadata": {}, + " speakers.append(name.parents[1].name)\n", "metadata": {}, + }, + { + "data": { + "text/html": [ + "<a href=\"test-gen-T0.9.wav\" target=\"_blank\">Listen to sample test-gen-T0.9.wav</a>" + "import torch\n", + "text/plain": [ + "<IPython.core.display.HTML object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "metadata": {}, + "# generate output using sampling, one token at a time\n", + "for T in [\"0.6\", \"0.7\", \"0.8\", \"0.9\", \"1.0\"]:\n", + " toks = []\n", + " for i in progress_bar(range(4500)):\n", + " p, loss = model(bx[3:4], torch.tensor([toks]).cuda(), loss=None)\n", + " last_p = p[0,-1]\n", { + "execution_count": null, + " save_wav(f'test-gen-T{T}.wav', torch.tensor(toks).cuda())" ] }, { "cell_type": "code", "execution_count": null, - "source": [ + " <th>2306</th>\n", "metadata": {}, - "cells": [ "metadata": {}, "outputs": [ { @@ -2159,8 +2346,8 @@ } ], "metadata": { "kernelspec": { - "source": [ + " <th>2306</th>\n", "outputs": [], "language": "python", "name": "python3" }