Liu Song’s Projects

~/Projects/WhisperSpeech

git clone https://code.lsong.org/WhisperSpeech

Commit

Commit

974428e2eb0219debe908e75d308bc94b85a5905

Author

Jakub Piotr Cłapa <[email protected]>

Date

2023-04-03 11:14:31 +0000 +0000

Diffstat

 nbs/3B. Semantic to acoustic token modeling (enc-sum).ipynb | 208 ++++++

Try a few temperatures when sampling from the model

diff --git a/nbs/3B. Semantic to acoustic token modeling (enc-sum).ipynb b/nbs/3B. Semantic to acoustic token modeling (enc-sum).ipynb
index 4017256de0edc4eab63d8044e21fdbe4d9ca6706..f1a24fddfd4519762d36e51053eb83a1c0a9aa1c 100644
--- a/nbs/3B. Semantic to acoustic token modeling (enc-sum).ipynb
+++ b/nbs/3B. Semantic to acoustic token modeling (enc-sum).ipynb
@@ -1875,6 +1875,16 @@     "          table_row_every_iters=40000, run_valid_every_iters=8000)"
    ]
   },
   {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e7036705",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "torch.save(model.state_dict(), 'saar-1000h-encsum-20e-5e-4-ce2.655.pth')"
+   ]
+  },
+  {
    "cell_type": "markdown",
    "id": "843d6703",
    "metadata": {},
@@ -1891,7 +1901,7 @@    "outputs": [],
    "source": [
     "model = SAARTransformer(depth=2).cuda()\n",
    "source": [
-    "import numpy as np\n",
+       "      <th>afile</th>\n",
     "model.eval().cuda();"
    ]
   },
@@ -1920,8 +1930,7 @@     "    with torch.no_grad():\n",
     "        audio = Amodel.decode([(Atoks.reshape(-1,2).T.unsqueeze(0), torch.tensor(1).cuda())])[0]\n",
     "    torchaudio.save(name, audio.cpu(), 24000)\n",
    "source": [
-  {
    "source": [
    ]
   },
   {
@@ -1975,7 +1983,7 @@     {
      "data": {
       "text/plain": [
    "source": [
-   "id": "e361412e",
+       "      <th>atoks</th>\n",
       ]
      },
      "execution_count": null,
@@ -2023,7 +2031,7 @@     {
      "data": {
       "text/html": [
    "source": [
-   "outputs": [
+       "      <th>stoks</th>\n",
       ],
       "text/plain": [
        "<IPython.core.display.HTML object>"
@@ -2036,8 +2044,8 @@    ],
    "source": [
     "# the ground truth compressed speech (the best we can hope for)\n",
    "source": [
-   "execution_count": null,
    "source": [
+  {
    ]
   },
   {
@@ -2080,13 +2088,54 @@        "\n",
        "    <div>\n",
        "      <progress value='4500' class='' max='4500' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
    "source": [
+       "    <tr>\n",
+       "    </div>\n",
+       "    "
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
       ]
+     },
+     "metadata": {},
+    "        speakers.append(name.parents[1].name)\n",
    "metadata": {},
+    "        speakers.append(name.parents[1].name)\n",
    "outputs": [],
    "execution_count": null,
+   "outputs": [],
+   "execution_count": null,
    "source": [
+      "text/html": [
+       "<a href=\"test-gen-T0.6.wav\" target=\"_blank\">Listen to sample test-gen-T0.6.wav</a>"
+{
  "cells": [
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+   "id": "0a853249",
  "cells": [
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "<style>\n",
+       "    /* Turns off some styling */\n",
+       "    progress {\n",
+       "        /* gets rid of default border in Firefox and Opera. */\n",
+       "        border: none;\n",
+       "        /* Needs to be in here for Safari polyfill so background images work as expected. */\n",
+       "        background-size: auto;\n",
+       "    }\n",
+       "    progress:not([value]), progress:not([value])::-webkit-progress-bar {\n",
+       "        background: repeating-linear-gradient(45deg, #7e7e7e, #7e7e7e 10px, #5c5c5c 10px, #5c5c5c 20px);\n",
+       "    }\n",
+       "    .progress-bar-interrupted, .progress-bar-interrupted::-webkit-progress-bar {\n",
+       "        background: #F44336;\n",
+       "    }\n",
+       "</style>\n"
       ],
       "text/plain": [
        "<IPython.core.display.HTML object>"
@@ -2098,8 +2146,26 @@     },
     {
      "data": {
       "text/html": [
+       "\n",
+       "    <div>\n",
        "  </thead>\n",
+{
+       "      100.00% [4500/4500 00:35&lt;00:00]\n",
+       "    </div>\n",
+       "    "
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+   "id": "0a853249",
   {
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<a href=\"test-gen-T0.7.wav\" target=\"_blank\">Listen to sample test-gen-T0.7.wav</a>"
       ],
       "text/plain": [
        "<IPython.core.display.HTML object>"
@@ -2107,37 +2173,159 @@       ]
      },
      "metadata": {},
      "output_type": "display_data"
+   "cell_type": "code",
    "id": "0a853249",
+   "outputs": [],
+    {
+     "data": {
    "metadata": {},
+  {
+       "\n",
+   "cell_type": "code",
    "id": "0a853249",
+   "source": [
+       "    /* Turns off some styling */\n",
+       "    progress {\n",
+       "        /* gets rid of default border in Firefox and Opera. */\n",
+       "        border: none;\n",
+       "        /* Needs to be in here for Safari polyfill so background images work as expected. */\n",
+       "        background-size: auto;\n",
+   "metadata": {},
    "outputs": [],
+       "    progress:not([value]), progress:not([value])::-webkit-progress-bar {\n",
+       "        background: repeating-linear-gradient(45deg, #7e7e7e, #7e7e7e 10px, #5c5c5c 10px, #5c5c5c 20px);\n",
+       "    }\n",
+       "    .progress-bar-interrupted, .progress-bar-interrupted::-webkit-progress-bar {\n",
+    "        grps.append(grp)"
    "source": [
+       "    }\n",
+       "</style>\n"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "    <div>\n",
        "  </thead>\n",
+{
+       "      <td>1</td>\n",
    "cell_type": "code",
+       "    </div>\n",
    "source": [
+   "id": "13462aa4",
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
      "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+   "execution_count": null,
    "source": [
+      "text/html": [
+       "<a href=\"test-gen-T0.8.wav\" target=\"_blank\">Listen to sample test-gen-T0.8.wav</a>"
+      ],
+      "text/plain": [
+   "cell_type": "code",
      "output_type": "execute_result"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+   "execution_count": null,
    "source": [
+      "text/html": [
+       "\n",
+   "cell_type": "code",
    "id": "0a853249",
+   "source": [
+   "cell_type": "code",
    "metadata": {},
+       "    progress {\n",
+       "        /* gets rid of default border in Firefox and Opera. */\n",
+       "        border: none;\n",
+       "        /* Needs to be in here for Safari polyfill so background images work as expected. */\n",
+       "        background-size: auto;\n",
+       "    }\n",
+       "    progress:not([value]), progress:not([value])::-webkit-progress-bar {\n",
+       "        background: repeating-linear-gradient(45deg, #7e7e7e, #7e7e7e 10px, #5c5c5c 10px, #5c5c5c 20px);\n",
+       "    }\n",
+       "    .progress-bar-interrupted, .progress-bar-interrupted::-webkit-progress-bar {\n",
+    "        grps.append(grp)"
    "source": [
+       "    }\n",
+       "</style>\n"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+   "cell_type": "code",
    ],
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "    <div>\n",
        "  </thead>\n",
+{
    "source": [
+       "  </thead>\n",
+       "    </div>\n",
    "source": [
+   "id": "13462aa4",
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+    "        speakers.append(name.parents[1].name)\n",
    "metadata": {},
+    },
+    {
+     "data": {
+      "text/html": [
+       "<a href=\"test-gen-T0.9.wav\" target=\"_blank\">Listen to sample test-gen-T0.9.wav</a>"
+    "import torch\n",
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
    "source": [
-   "metadata": {},
+    "# generate output using sampling, one token at a time\n",
+    "for T in [\"0.6\", \"0.7\", \"0.8\", \"0.9\", \"1.0\"]:\n",
+    "    toks = []\n",
+    "    for i in progress_bar(range(4500)):\n",
+    "        p, loss = model(bx[3:4], torch.tensor([toks]).cuda(), loss=None)\n",
+    "        last_p = p[0,-1]\n",
 {
+   "execution_count": null,
+    "    save_wav(f'test-gen-T{T}.wav', torch.tensor(toks).cuda())"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "source": [
+       "      <th>2306</th>\n",
    "metadata": {},
- "cells": [
    "metadata": {},
    "outputs": [
     {
@@ -2159,8 +2346,8 @@   }
  ],
  "metadata": {
   "kernelspec": {
-   "source": [
+       "      <th>2306</th>\n",
    "outputs": [],
    "language": "python",
    "name": "python3"
   }