Commit: init

This view is limited to 50 files because the commit contains too many changes.
- .gitignore +1 -0
- Dockerfile +100 -0
- __pycache__/app.cpython-311.pyc +0 -0
- __pycache__/inference_functions.cpython-311.pyc +0 -0
- __pycache__/load_models.cpython-311.pyc +0 -0
- __pycache__/server.cpython-311.pyc +0 -0
- __pycache__/silence_removal.cpython-311.pyc +0 -0
- __pycache__/stream_VAD.cpython-311.pyc +0 -0
- __pycache__/stream_VAD2.cpython-311.pyc +0 -0
- __pycache__/stream_prod_main2.cpython-311.pyc +0 -0
- app.py +79 -0
- app/package-lock.json +985 -0
- app/package.json +19 -0
- app/public/app.js +99 -0
- app/public/index.html +35 -0
- app/public/styles.css +96 -0
- app/server.js +102 -0
- app/temp_wav_files/audio-1718725396714.wav +0 -0
- app/uploads/1/audio_2.wav +0 -0
- app/uploads/1/transcription_2.txt +1 -0
- audio_segments/readme +0 -0
- inference_functions.py +80 -0
- load_models.py +18 -0
- main.ipynb +395 -0
- main.py +79 -0
- main_stream.ipynb +87 -0
- models/TTS_utils.py +365 -0
- models/__init__.py +0 -0
- models/__pycache__/TTS_utils.cpython-311.pyc +0 -0
- models/__pycache__/__init__.cpython-311.pyc +0 -0
- models/__pycache__/__init__.cpython-38.pyc +0 -0
- models/__pycache__/es_fastconformer.cpython-311.pyc +0 -0
- models/__pycache__/nllb.cpython-311.pyc +0 -0
- models/__pycache__/nllb.cpython-38.pyc +0 -0
- models/__pycache__/noise_red.cpython-311.pyc +0 -0
- models/__pycache__/parakeet.cpython-311.pyc +0 -0
- models/__pycache__/parakeet.cpython-38.pyc +0 -0
- models/es_fastconformer.py +37 -0
- models/nllb.py +72 -0
- models/noise_red.py +28 -0
- models/parakeet.py +43 -0
- models/status.txt +1 -0
- record_per.json +1 -0
- record_temp.json +1 -0
- requirements.txt +25 -0
- results/readme +0 -0
- run.py +73 -0
- setup.sh +18 -0
- status.txt +1 -0
- stream_VAD.py +249 -0
.gitignore (ADDED)
@@ -0,0 +1 @@
app/node_modules
Dockerfile (ADDED)
@@ -0,0 +1,100 @@
# Use an official CUDA-enabled image from NVIDIA with CUDA 12.1
FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04

# Set the working directory in the container
WORKDIR /app

# Set the environment variable to suppress interactive prompts
ENV DEBIAN_FRONTEND=noninteractive

# Install necessary OS packages and Python 3.9
RUN apt-get update && apt-get install -y \
    software-properties-common \
    && add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update && apt-get install -y \
    python3.9 \
    python3.9-distutils \
    python3.9-venv \
    python3.9-dev \
    build-essential \
    cmake \
    libsndfile1 \
    ffmpeg \
    portaudio19-dev \
    alsa-utils \
    curl \
    git \
    nodejs \
    npm \
    && rm -rf /var/lib/apt/lists/*

# Install pip for Python 3.9
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.9

# Create symlinks for python3.9 and pip
RUN ln -s /usr/bin/python3.9 /usr/bin/python
RUN ln -s /usr/local/bin/pip /usr/bin/pip

# Set CUDA_HOME environment variable
ENV CUDA_HOME=/usr/local/cuda

# Add CUDA to PATH
ENV PATH=${CUDA_HOME}/bin:${PATH}

# Optionally set LD_LIBRARY_PATH for CUDA libraries
ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}

# Set environment variable for NeMo cache directory
ENV NEMO_NLP_TMP=/app/.cache

# Create cache directory
RUN mkdir -p /app/.cache

# Copy the setup script and requirements file into the container
COPY setup.sh requirements.txt /app/

# Make the setup script executable
RUN chmod +x setup.sh

# Copy the application code into the container
COPY . /app

# Copy wait-for-it script and make it executable
COPY wait-for-it.sh /app/wait-for-it.sh
RUN chmod +x /app/wait-for-it.sh

# Install dependencies
RUN pip install --upgrade pip setuptools wheel
RUN pip install pybind11
RUN pip install fasttext
RUN pip install Cython
RUN pip install pyaudio
RUN pip install fastapi uvicorn
RUN pip install uvloop

# Install PyTorch and torchaudio
RUN pip install torch==2.2.2+cu121 -f https://download.pytorch.org/whl/cu121/torch_stable.html
RUN pip install torchaudio==2.2.2+cu121 -f https://download.pytorch.org/whl/cu121/torch_stable.html

# Install the requirements
RUN pip install -r requirements.txt

# Clone and install TTS
RUN git clone https://github.com/coqui-ai/TTS/ && \
    cd TTS && \
    make install

# Install Node.js dependencies
RUN cd /app/app && npm install

# Expose the ports
EXPOSE 8000
EXPOSE 3000

# Set the environment variable to indicate running in Docker
ENV IN_DOCKER=True

# Run the FastAPI app and Node.js server
CMD ["sh", "-c", "uvicorn app:app --host 0.0.0.0 --port 8000 & /app/wait-for-it.sh --url http://0.0.0.0:8000/health --strict -- node /app/app/server.js"]
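Note on the CMD above: uvicorn is started in the background and the Node server is launched only once the FastAPI /health endpoint responds. wait-for-it.sh itself is not part of this diff, so its flags are taken at face value; the following is only a minimal Python sketch of that readiness gate, with the URL taken from the CMD and the 120-second timeout being an assumption.

import subprocess
import time
import urllib.request

def wait_for_health(url: str = "http://0.0.0.0:8000/health", timeout: float = 120.0) -> None:
    # Poll the health endpoint until it answers 200 or the deadline passes.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(url, timeout=2) as resp:
                if resp.status == 200:
                    return
        except OSError:
            pass  # server not up yet; retry
        time.sleep(1)
    raise TimeoutError(f"{url} not healthy after {timeout}s")

if __name__ == "__main__":
    # Start the API in the background, wait for readiness, then hand off to Node.
    api = subprocess.Popen(["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"])
    wait_for_health()
    subprocess.run(["node", "/app/app/server.js"], check=True)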
__pycache__/app.cpython-311.pyc (ADDED): binary file, 5.74 kB
__pycache__/inference_functions.cpython-311.pyc (ADDED): binary file, 7.07 kB
__pycache__/load_models.cpython-311.pyc (ADDED): binary file, 817 Bytes
__pycache__/server.cpython-311.pyc (ADDED): binary file, 2.21 kB
__pycache__/silence_removal.cpython-311.pyc (ADDED): binary file, 1.6 kB
__pycache__/stream_VAD.cpython-311.pyc (ADDED): binary file, 13 kB
__pycache__/stream_VAD2.cpython-311.pyc (ADDED): binary file, 13.5 kB
__pycache__/stream_prod_main2.cpython-311.pyc (ADDED): binary file, 4.17 kB
app.py (ADDED)
@@ -0,0 +1,79 @@
import fastapi
import uvicorn
from fastapi import File, UploadFile, Form, HTTPException
from fastapi.responses import JSONResponse, FileResponse
from load_models import get_nllb_model_and_tokenizer, get_xtts_model
from inference_functions import translate, just_inference
import os
import torch

# Set GPU memory fraction
torch.cuda.set_per_process_memory_fraction(0.75, 0)

# Load models
model_nllb, tokenizer_nllb = get_nllb_model_and_tokenizer()
model_xtts = get_xtts_model()

app = fastapi.FastAPI()

@app.get("/health")
def health_check():
    return {"status": "ok"}

@app.post("/translate/")
def translate_text(text: str = Form(...), target_lang: str = Form(...)):
    translation = translate(model_nllb, tokenizer_nllb, text, target_lang)
    return {"translation": translation}

@app.post("/inference/")
def inference_audio(original_path: UploadFile = File(...), text: str = Form(...), lang: str = Form(...)):
    # Save the uploaded file
    file_location = f"/tmp/{original_path.filename}"
    with open(file_location, "wb") as file:
        file.write(original_path.file.read())

    output_dir = f"/tmp/generated_audio_{os.path.basename(file_location)}.wav"
    torch.cuda.empty_cache()
    generated_audio = just_inference(model_xtts, file_location, output_dir, text, lang)
    return {"path_to_save": output_dir}

@app.post("/process-audio/")
async def process_audio(original_path: UploadFile = File(...), text: str = Form(...), lang: str = Form(...), target_lang: str = Form(...)):
    print(f"original_path: {original_path.filename}")
    print(f"text: {text}")
    print(f"lang: {lang}")
    print(f"target_lang: {target_lang}")

    # Validate target language
    if target_lang not in ["es", "en"]:  # Use 'es' and 'en' to match the example values
        print("Unsupported language")
        raise HTTPException(status_code=400, detail="Unsupported language. Use 'es' or 'en'.")

    try:
        # Translate the text first
        translated_text = translate(model_nllb, tokenizer_nllb, text, target_lang)
        print(f"translated_text: {translated_text}")

        # Save the uploaded file
        file_location = f"/tmp/{original_path.filename}"
        with open(file_location, "wb") as file:
            file.write(original_path.file.read())

        output_dir = f"/tmp/generated_audio_{os.path.basename(file_location)}.wav"
        torch.cuda.empty_cache()
        generated_audio = just_inference(model_xtts, file_location, output_dir, translated_text, target_lang)

        return JSONResponse(content={"audio_path": output_dir, "translation": translated_text})

    except Exception as e:
        print(f"Error during processing: {e}")
        raise HTTPException(status_code=500, detail="Error during processing")

@app.get("/download-audio/")
def download_audio(file_path: str):
    if not os.path.exists(file_path):
        raise HTTPException(status_code=404, detail="File not found")
    return FileResponse(file_path, media_type='audio/wav', filename=os.path.basename(file_path))

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
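To make the request and response shapes concrete, here is a minimal client sketch for the endpoints above. It assumes the service is reachable on localhost:8000 (matching the uvicorn.run call), that a local sample.wav exists, and it uses the third-party requests package, which is not part of this repo.

import requests

BASE = "http://localhost:8000"

# Upload a reference recording plus the text to translate and speak.
with open("sample.wav", "rb") as f:  # sample.wav is an assumed local file
    resp = requests.post(
        f"{BASE}/process-audio/",
        files={"original_path": ("sample.wav", f, "audio/wav")},
        data={"text": "Hello, how are you?", "lang": "en", "target_lang": "es"},
    )
resp.raise_for_status()
payload = resp.json()
print("translation:", payload["translation"])

# Fetch the generated audio via the download endpoint.
audio = requests.get(f"{BASE}/download-audio/", params={"file_path": payload["audio_path"]})
audio.raise_for_status()
with open("generated.wav", "wb") as out:
    out.write(audio.content)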
app/package-lock.json (ADDED)
@@ -0,0 +1,985 @@
{
  "name": "audio-transcription",
  "version": "1.0.0",
  "lockfileVersion": 3,
  "requires": true,
  "packages": {
    "": {
      "name": "audio-transcription",
      "version": "1.0.0",
      "license": "ISC",
      "dependencies": {
        "express": "^4.19.2",
        "form-data": "^4.0.0",
        "multer": "^1.4.5-lts.1",
        "node-fetch": "^2.7.0",
        "wav": "^1.0.2"
      }
    },
    ...
  }
}
(The remaining ~960 generated lines pin resolved versions and integrity hashes for the transitive dependencies of these five packages, among them accepts, body-parser, busboy, concat-stream, debug, http-errors, mime-types, qs, readable-stream, safe-buffer, send, serve-static, side-channel, stream-parser, streamsearch, whatwg-url, and xtend.)
app/package.json (ADDED)
@@ -0,0 +1,19 @@
{
  "name": "audio-transcription",
  "version": "1.0.0",
  "description": "",
  "main": "app.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "keywords": [],
  "author": "",
  "license": "ISC",
  "dependencies": {
    "express": "^4.19.2",
    "form-data": "^4.0.0",
    "multer": "^1.4.5-lts.1",
    "node-fetch": "^2.7.0",
    "wav": "^1.0.2"
  }
}
app/public/app.js
ADDED
@@ -0,0 +1,99 @@
const recordButton = document.getElementById('record');
const status = document.getElementById('status');
const transcriptionElement = document.getElementById('transcription');
const audioElement = document.getElementById('audio');
const translationElement = document.getElementById('translation');

let mediaRecorder;
let audioChunks = [];
let transcript = '';
let sentenceIndex = 0;

const recognition = new (window.SpeechRecognition || window.webkitSpeechRecognition)();
recognition.continuous = true;
recognition.interimResults = true;

recognition.onresult = (event) => {
    let interimTranscript = '';
    for (let i = event.resultIndex; i < event.results.length; ++i) {
        if (event.results[i].isFinal) {
            transcript += event.results[i][0].transcript + ' ';
            saveAudioAndTranscription(event.results[i][0].transcript, sentenceIndex++);
        } else {
            interimTranscript += event.results[i][0].transcript;
        }
    }
    transcriptionElement.innerHTML = transcript + '<i style="color:red;">' + interimTranscript + '</i>';
};

recognition.onerror = (event) => {
    console.error(event.error);
};

recordButton.onmousedown = async () => {
    status.textContent = "Recording...";
    transcript = '';
    sentenceIndex = 0;

    // Start speech recognition
    recognition.start();

    // Start audio recording
    const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
    mediaRecorder = new MediaRecorder(stream, { mimeType: 'audio/webm' });
    mediaRecorder.start();

    mediaRecorder.ondataavailable = (event) => {
        audioChunks.push(event.data);
    };
};

recordButton.onmouseup = () => {
    status.textContent = "Recording stopped";

    // Stop speech recognition and audio recording
    recognition.stop();
    mediaRecorder.stop();

    // Process the recorded audio
    saveAudioAndTranscription(transcript, sentenceIndex);
};

async function saveAudioAndTranscription(sentence, index) {
    mediaRecorder.stop();
    mediaRecorder.onstop = async () => {
        const audioBlob = new Blob(audioChunks, { type: 'audio/webm' });
        const arrayBuffer = await audioBlob.arrayBuffer();
        const audioBuffer = new Uint8Array(arrayBuffer);

        const formData = new FormData();
        formData.append('audio', new Blob([audioBuffer], { type: 'application/octet-stream' }));
        formData.append('transcript', sentence);
        formData.append('sampleRate', mediaRecorder.stream.getAudioTracks()[0].getSettings().sampleRate);
        formData.append('numberOfChannels', 1); // Assuming mono audio

        try {
            const response = await fetch('/save-audio', {
                method: 'POST',
                body: formData
            });

            if (response.ok) {
                const result = await response.json();
                console.log(`Saved sentence ${index}`);

                // Show translation and play audio
                translationElement.textContent = result.translation;
                audioElement.src = `http://localhost:8000/download-audio?file_path=${result.audio_path}`;
                audioElement.play();
            } else {
                console.error('Failed to save the file.');
            }
        } catch (error) {
            console.error('Error saving audio and transcription:', error);
        }

        audioChunks = [];
        mediaRecorder.start();
    };
}
app/public/index.html
ADDED
@@ -0,0 +1,35 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Audio Recording and Translation</title>
    <link rel="stylesheet" href="styles.css">
    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.4/css/all.min.css">
    <link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Roboto:wght@400;500;700&display=swap">
</head>
<body>
    <div class="container">
        <header>
            <h1>Seamless Speech-to-Speech Translation with Voice Replication (S3TVR)</h1>
            <p class="description">S3TVR is an advanced AI cascaded framework designed for real-time speech-to-speech translation while maintaining the speaker's voice characteristics in a zero-shot fashion. This project balances latency and output quality, focusing on English and Spanish, and involves multiple open-source models and algorithms. The system is optimized for local execution, allowing for dynamic and efficient voice translation with an average latency of ~3 seconds per sentence. For the optimized model, check the GitHub repo below.</p>
            <p class="description">NOTE: The local execution is streamed and fully optimized (unlike this demo).</p>
            <div class="links">
                <a href="https://github.com/yalsaffar/S3TVR" target="_blank"><i class="fab fa-github"></i></a>
                <a href="https://yousifalsaffar.com/" target="_blank"><i class="fas fa-globe"></i></a>
                <a href="https://www.linkedin.com/in/yousif-alsaffar-7621b5142/" target="_blank"><i class="fab fa-linkedin"></i></a>
                <a href="https://huggingface.co/yalsaffar" target="_blank"><i class="fas fa-robot"></i></a>
            </div>
        </header>
        <div class="circle-button" id="record">
            <i class="fas fa-microphone"></i>
        </div>
        <p id="label">Press and hold until the sentence is no longer red</p>
        <p id="status"> </p>
        <div id="transcription" class="text-output"></div>
        <div id="translation" class="text-output"></div>
        <audio id="audio" controls></audio>
    </div>
    <script src="app.js"></script>
</body>
</html>
app/public/styles.css
ADDED
@@ -0,0 +1,96 @@
body {
    font-family: 'Roboto', sans-serif;
    display: flex;
    justify-content: center;
    align-items: center;
    height: 100vh;
    background-color: #f5f5f5;
    margin: 0;
    padding: 20px;
    box-sizing: border-box;
}

.container {
    text-align: center;
    max-width: 800px;
    width: 100%;
}

header {
    margin-bottom: 20px;
}

header h1 {
    font-size: 2em;
    font-weight: 700;
    margin-bottom: 10px;
}

header .description {
    font-size: 1.1em;
    font-weight: 400;
    color: #555;
    margin-bottom: 20px;
    line-height: 1.6;
}

.links {
    display: flex;
    justify-content: center;
    gap: 20px;
    margin-bottom: 20px;
}

.links a {
    color: #333;
    font-size: 1.5em;
    transition: color 0.3s;
}

.links a:hover {
    color: #ff4757;
}

.circle-button {
    width: 100px;
    height: 100px;
    background-color: #ff4757;
    border-radius: 50%;
    display: flex;
    justify-content: center;
    align-items: center;
    cursor: pointer;
    margin: 20px auto;
    transition: background-color 0.3s ease;
}

.circle-button:hover {
    background-color: #ff6b81;
}

.circle-button:active {
    background-color: #34c759;
}

.circle-button i {
    color: white;
    font-size: 2em;
}

.text-output {
    background-color: white;
    border-radius: 5px;
    box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
    padding: 15px;
    margin: 10px auto;
    width: 80%;
    max-width: 500px;
    text-align: left;
    font-size: 1em;
    line-height: 1.5;
}

#status {
    font-weight: bold;
    margin-top: 10px;
}
app/server.js
ADDED
@@ -0,0 +1,102 @@
const express = require('express');
const multer = require('multer');
const path = require('path');
const fs = require('fs');
const { exec } = require('child_process');
const fetch = require('node-fetch');
const FormData = require('form-data');

const app = express();
const port = 3000;

const uploadsDir = path.join(__dirname, 'uploads');
if (!fs.existsSync(uploadsDir)) {
    fs.mkdirSync(uploadsDir);
}

const storage = multer.memoryStorage();
const upload = multer({ storage: storage });

app.use(express.static(path.join(__dirname, 'public')));
app.use(express.json());

const getNextFolderNumber = () => {
    const folders = fs.readdirSync(uploadsDir).filter(file => fs.statSync(path.join(uploadsDir, file)).isDirectory());
    const folderNumbers = folders.map(folder => parseInt(folder)).filter(num => !isNaN(num));
    return folderNumbers.length > 0 ? Math.max(...folderNumbers) + 1 : 1;
};

let sentenceIndex = 0;
let audioPaths = [];

app.post('/save-audio', upload.single('audio'), async (req, res) => {
    const nextFolderNumber = getNextFolderNumber();
    const folderPath = path.join(uploadsDir, nextFolderNumber.toString());
    if (!fs.existsSync(folderPath)) {
        fs.mkdirSync(folderPath, { recursive: true });
    }

    const rawAudioPath = path.join(folderPath, `audio_${sentenceIndex}.webm`);
    const wavAudioPath = path.join(folderPath, `audio_${sentenceIndex}.wav`);
    const transcriptionPath = path.join(folderPath, `transcription_${sentenceIndex}.txt`);

    fs.writeFileSync(rawAudioPath, req.file.buffer);

    fs.writeFileSync(transcriptionPath, req.body.transcript);

    const ffmpegCommand = `ffmpeg -i ${rawAudioPath} -ar 44100 -ac 1 ${wavAudioPath}`;
    exec(ffmpegCommand, async (error, stdout, stderr) => {
        if (error) {
            console.error(`Error converting audio to WAV: ${stderr}`);
            return res.status(500).send('Error converting audio to WAV');
        }

        fs.unlinkSync(rawAudioPath);

        const formData = new FormData();
        formData.append('original_path', fs.createReadStream(wavAudioPath));
        formData.append('text', req.body.transcript);
        formData.append('lang', 'en');
        formData.append('target_lang', 'es');

        try {
            const response = await fetch('http://localhost:8000/process-audio/', {
                method: 'POST',
                body: formData,
                headers: formData.getHeaders()
            });

            if (response.ok) {
                const result = await response.json();
                console.log(result);
                audioPaths.push(result.audio_path);
                sentenceIndex++;
                res.status(200).json({ audio_path: result.audio_path, translation: result.translation });
            } else {
                console.error('Failed to process the file via FastAPI');
                res.status(500).send('Failed to process the file via FastAPI');
            }
        } catch (error) {
            console.error('Error calling FastAPI:', error);
            res.status(500).send('Error calling FastAPI');
        }
    });
});

app.get('/concatenate-audio', (req, res) => {
    const folderPath = path.join(uploadsDir, getNextFolderNumber().toString());
    const finalAudioPath = path.join(folderPath, 'final_audio.wav');
    const concatCommand = `ffmpeg -y -i "concat:${audioPaths.join('|')}" -acodec copy ${finalAudioPath}`;
    exec(concatCommand, (concatError, concatStdout, concatStderr) => {
        if (concatError) {
            console.error(`Error concatenating audio files: ${concatStderr}`);
            return res.status(500).send('Error concatenating audio files');
        }

        res.status(200).json({ audio_path: finalAudioPath });
    });
});

app.listen(port, () => {
    console.log(`Server running at http://localhost:${port}`);
});
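server.js forwards each sentence to a FastAPI backend on port 8000 and only relies on the response containing `audio_path` and `translation`. A minimal sketch of the endpoint contract it assumes, inferred from the form fields above; the handler body is a placeholder, not the repo's actual app.py:

# Hypothetical sketch of the FastAPI endpoint that server.js posts to.
# Field names mirror the form data built in server.js; the processing is elided.
from fastapi import FastAPI, File, Form, UploadFile

api = FastAPI()

@api.post("/process-audio/")
async def process_audio(
    original_path: UploadFile = File(...),  # the converted WAV recording
    text: str = Form(...),                  # browser transcript of the sentence
    lang: str = Form(...),                  # source language, e.g. "en"
    target_lang: str = Form(...),           # target language, e.g. "es"
):
    # ... translate `text` and synthesize it in the speaker's voice ...
    return {"audio_path": "results/output.wav", "translation": "..."}  # keys server.js reads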
app/temp_wav_files/audio-1718725396714.wav
ADDED
Binary file (278 kB).
app/uploads/1/audio_2.wav
ADDED
Binary file (307 kB).
app/uploads/1/transcription_2.txt
ADDED
@@ -0,0 +1 @@
okay now we're still actually works
audio_segments/readme
ADDED
File without changes
inference_functions.py
ADDED
@@ -0,0 +1,80 @@
import time
import torch
import torchaudio
import noisereduce as nr
import numpy as np
from models.nllb import nllb_translate

def translate(model_nllb, tokenizer_nllb, text, target_lang):
    print("Processing translation...")
    start_time = time.time()
    translation = nllb_translate(model_nllb, tokenizer_nllb, text, target_lang)
    print("Translation:", translation)
    print("Translation time:", time.time() - start_time)
    return translation

def just_inference(model, original_path, output_dir, text, lang):
    print("Inference...")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    path_to_save = output_dir
    t0 = time.time()

    try:
        # Load the audio
        print("Loading audio...")
        wav, sr = torchaudio.load(original_path)
        print(f"Loaded audio with sample rate: {sr}")

        wav = wav.squeeze().numpy()
        print(f"Audio shape after squeezing: {wav.shape}")

        # Apply noise reduction
        print("Applying noise reduction...")
        reduced_noise_audio = nr.reduce_noise(y=wav, sr=sr)
        reduced_noise_audio = torch.tensor(reduced_noise_audio).unsqueeze(0)
        print(f"Reduced noise audio shape: {reduced_noise_audio.shape}")

        # Move the reduced noise audio to the correct device
        reduced_noise_audio = reduced_noise_audio.to(device)

        print("Getting conditioning latents...")
        gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=[original_path])
        print("Got conditioning latents.")

        print("Starting inference stream...")
        chunks = model.inference_stream(
            text,
            lang,
            gpt_cond_latent,
            speaker_embedding,
            stream_chunk_size=15,
            speed=0.95
        )
        print("Inference stream started.")

        full_audio = torch.Tensor().to(device)
        for i, chunk in enumerate(chunks):
            try:
                if i == 1:
                    time_to_first_chunk = time.time() - t0
                    print(f"Time to first chunk: {time_to_first_chunk}")
                full_audio = torch.cat((full_audio, chunk.squeeze().to(device)), dim=-1)
                print(f"Processed chunk {i}, chunk shape: {chunk.shape}")
            except Exception as e:
                print(f"Error processing chunk {i}: {e}")
                raise

        # Move full_audio to CPU before saving
        full_audio = full_audio.cpu()

        print(f"Saving full audio to {path_to_save}...")
        torchaudio.save(path_to_save, full_audio.unsqueeze(0), 24000)
        print("Audio saved.")

        print("Inference finished")
        return full_audio

    except Exception as e:
        print(f"Error during processing: {e}")
        raise
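Together, translate and just_inference form the translate-then-synthesize half of the pipeline. A minimal chaining sketch, assuming an XTTS model loaded via models.TTS_utils.load_manual_xtts_v2 and the NLLB pair from models.nllb.nllb(); the paths are illustrative, and the language tags follow the usage seen elsewhere in this repo ("spanish" for NLLB, "es" for XTTS):

# Illustrative only: paths are made up; config paths match load_models.py below.
from models.nllb import nllb
from models.TTS_utils import load_manual_xtts_v2
from inference_functions import translate, just_inference

model_nllb, tokenizer_nllb = nllb()
xtts = load_manual_xtts_v2("test/config.json", "test")

spanish_text = translate(model_nllb, tokenizer_nllb, "hello can you hear me", "spanish")
# Clone the speaker's voice from the source recording and speak the translation:
just_inference(xtts, "app/uploads/1/audio_2.wav", "results/audio_2_es.wav", spanish_text, "es")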
load_models.py
ADDED
@@ -0,0 +1,18 @@
from models.nllb import nllb
#from models.parakeet import parakeet_ctc_model
model_nllb, tokenizer_nllb = nllb()

from models.TTS_utils import load_manual_xtts_v2


config_path = "test/config.json"
model_path = "test"

xtts_v2_model = load_manual_xtts_v2(config_path, model_path)


def get_nllb_model_and_tokenizer():
    return model_nllb, tokenizer_nllb

def get_xtts_model():
    return xtts_v2_model
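load_models.py builds each model once at import time; the getters expose those singletons so other modules reuse them instead of reloading. A usage sketch (the importing module is an assumption):

# Reuse the models loaded when load_models was first imported.
from load_models import get_nllb_model_and_tokenizer, get_xtts_model

model_nllb, tokenizer_nllb = get_nllb_model_and_tokenizer()
xtts_v2_model = get_xtts_model()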
main.ipynb
ADDED
@@ -0,0 +1,395 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\spn\\anaconda3\\envs\\capstone\\Lib\\site-packages\\torchvision\\io\\image.py:13: UserWarning: Failed to load image Python extension: '[WinError 127] The specified procedure could not be found'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?\n",
" warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-06-10 23:30:49,190] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[2024-06-10 23:30:49,544] torch.distributed.elastic.multiprocessing.redirects: [WARNING] NOTE: Redirects are currently not supported in Windows or MacOs.\n",
"[NeMo W 2024-06-10 23:30:52 nemo_logging:393] Could not import NeMo NLP collection which is required for speech translation model.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[NeMo I 2024-06-10 23:31:08 nemo_logging:381] Tokenizer SentencePieceTokenizer initialized with 1024 tokens\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[NeMo W 2024-06-10 23:31:08 nemo_logging:393] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.\n",
" Train config : \n",
" manifest_filepath: /disk1/NVIDIA/datasets/LibriSpeech_NeMo/librivox-train-all.json\n",
" sample_rate: 16000\n",
" batch_size: 16\n",
" shuffle: true\n",
" num_workers: 8\n",
" pin_memory: true\n",
" use_start_end_token: false\n",
" trim_silence: false\n",
" max_duration: 16.7\n",
" min_duration: 0.1\n",
" is_tarred: false\n",
" tarred_audio_filepaths: null\n",
" shuffle_n: 2048\n",
" bucketing_strategy: fully_randomized\n",
" bucketing_batch_size: null\n",
" \n",
"[NeMo W 2024-06-10 23:31:08 nemo_logging:393] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). \n",
" Validation config : \n",
" manifest_filepath: /disk1/NVIDIA/datasets/LibriSpeech_NeMo/librivox-dev-clean.json\n",
" sample_rate: 16000\n",
" batch_size: 16\n",
" shuffle: false\n",
" use_start_end_token: false\n",
" num_workers: 8\n",
" pin_memory: true\n",
" \n",
"[NeMo W 2024-06-10 23:31:08 nemo_logging:393] Please call the ModelPT.setup_test_data() or ModelPT.setup_multiple_test_data() method and provide a valid configuration file to setup the test data loader(s).\n",
" Test config : \n",
" manifest_filepath: null\n",
" sample_rate: 16000\n",
" batch_size: 16\n",
" shuffle: false\n",
" use_start_end_token: false\n",
" num_workers: 8\n",
" pin_memory: true\n",
" \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[NeMo I 2024-06-10 23:31:08 nemo_logging:381] PADDING: 0\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[NeMo W 2024-06-10 23:31:11 nemo_logging:393] `method_cfg` is deprecated and will be removed in the future. Please use `measure_cfg` instead.\n",
"[NeMo W 2024-06-10 23:31:11 nemo_logging:393] Re-writing `measure_cfg` with the value of `method_cfg`.\n",
"[NeMo W 2024-06-10 23:31:11 nemo_logging:393] `temperature` is deprecated and will be removed in the future. Please use `alpha` instead.\n",
"[NeMo W 2024-06-10 23:31:11 nemo_logging:393] Re-writing `alpha` with the value of `temperature`.\n",
"[NeMo W 2024-06-10 23:31:11 nemo_logging:393] `method_cfg` is deprecated and will be removed in the future. Please use `measure_cfg` instead.\n",
"[NeMo W 2024-06-10 23:31:11 nemo_logging:393] Re-writing `measure_cfg` with the value of `method_cfg`.\n",
"[NeMo W 2024-06-10 23:31:11 nemo_logging:393] `temperature` is deprecated and will be removed in the future. Please use `alpha` instead.\n",
"[NeMo W 2024-06-10 23:31:11 nemo_logging:393] Re-writing `alpha` with the value of `temperature`.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[NeMo I 2024-06-10 23:31:16 nemo_logging:381] Model EncDecCTCModelBPE was successfully restored from C:\\Users\\spn\\.cache\\huggingface\\hub\\models--nvidia--parakeet-ctc-0.6b\\snapshots\\097ffc5b027beabc73acb627def2d1d278e774e9\\parakeet-ctc-0.6b.nemo.\n"
]
}
],
"source": [
"from models.nllb import nllb\n",
"#from models.TTS_utils import xtts_v2\n",
"from models.parakeet import parakeet_ctc_model\n",
"from models.es_fastconformer import stt_es_model\n",
"model_nllb, tokenizer_nllb = nllb()\n",
"#xtts_v2_model = xtts_v2()\n",
"parakeet = parakeet_ctc_model()\n",
"#sst = stt_es_model()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Writing audio_segments\\segment_0.wav...\n",
"Processing segment...\n",
"0.021454915\n",
"Noise reduction done!\n",
"Noise removed. Time: 0.06042814254760742\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6909654da05f4b0a88458139a9b37d6d",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Transcribing: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Transcription: hello can you hear me\n",
"Transcription time: 1.3255603313446045\n",
"Translating...\n",
"Processing translation...\n",
"Translation: Hola, ¿ me escuchas?\n",
"Translation time: 0.932790994644165\n",
"Writing audio_segments\\segment_1.wav...\n",
"Processing segment...\n",
"0.010297036\n",
"No speech detected.\n",
"Writing audio_segments\\segment_2.wav...\n",
"Processing segment...\n",
"0.006772096\n",
"No speech detected.\n",
"Writing audio_segments\\segment_3.wav...\n",
"Processing segment...\n",
"0.0034770737\n",
"No speech detected.\n",
"Writing audio_segments\\segment_4.wav...\n",
"Processing segment...\n",
"0.0039069764\n",
"No speech detected.\n",
"Writing audio_segments\\segment_5.wav...\n",
"Processing segment...\n",
"0.0046523036\n",
"No speech detected.\n",
"Writing audio_segments\\segment_6.wav...\n",
"Processing segment...\n",
"0.0040206155\n",
"No speech detected.\n",
"Writing audio_segments\\segment_7.wav...\n",
"Processing segment...\n",
"0.0043495107\n",
"No speech detected.\n",
"Writing audio_segments\\segment_8.wav...\n",
"Processing segment...\n",
"0.00421352\n",
"No speech detected.\n",
"Writing audio_segments\\segment_9.wav...\n",
"Processing segment...\n",
"0.0040656724\n",
"No speech detected.\n",
"Writing audio_segments\\segment_10.wav...\n",
"Processing segment...\n",
"0.0042125704\n",
"No speech detected.\n",
"Writing audio_segments\\segment_11.wav...\n",
"Processing segment...\n",
"0.015398192\n",
"Noise reduction done!\n",
"Noise removed. Time: 0.020929336547851562\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "de3d4b3a7bc14de2afbb01ff82252dc2",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Transcribing: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from stream_VAD import stream\n",
"stream(parakeet, model_nllb, tokenizer_nllb, \"english\", \"spanish\", 'record_temp.json', 'record_per.json')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "fdc0440dfcaf4c9f814689fc47c10e3e",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"(…)tt_es_fastconformer_hybrid_large_pc.nemo: 0%| | 0.00/459M [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[NeMo I 2024-04-12 16:10:09 nemo_logging:381] Tokenizer SentencePieceTokenizer initialized with 1024 tokens\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[NeMo W 2024-04-12 16:10:10 nemo_logging:393] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.\n",
" Train config : \n",
" manifest_filepath: null\n",
" sample_rate: 16000\n",
" batch_size: 16\n",
" shuffle: true\n",
" num_workers: 8\n",
" pin_memory: true\n",
" use_start_end_token: false\n",
" trim_silence: false\n",
" max_duration: 20\n",
" min_duration: 0.1\n",
" is_tarred: false\n",
" tarred_audio_filepaths: null\n",
" shuffle_n: 2048\n",
" bucketing_strategy: fully_randomized\n",
" bucketing_batch_size: null\n",
" is_concat: false\n",
" concat_sampling_technique: random\n",
" concat_sampling_probabilities: ''\n",
" \n",
"[NeMo W 2024-04-12 16:10:10 nemo_logging:393] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). \n",
" Validation config : \n",
" manifest_filepath: null\n",
" sample_rate: 16000\n",
" batch_size: 32\n",
" shuffle: false\n",
" num_workers: 8\n",
" pin_memory: true\n",
" use_start_end_token: false\n",
" is_concat: true\n",
" concat_sampling_technique: random\n",
" concat_sampling_probabilities:\n",
" - 0.099\n",
" - 0.2771\n",
" - 0.5482\n",
" - 0.0757\n",
" concat_shuffle: false\n",
" concat_sampling_seed: 1234\n",
" max_duration: 20\n",
" \n",
"[NeMo W 2024-04-12 16:10:10 nemo_logging:393] Please call the ModelPT.setup_test_data() or ModelPT.setup_multiple_test_data() method and provide a valid configuration file to setup the test data loader(s).\n",
" Test config : \n",
" manifest_filepath: null\n",
" sample_rate: 16000\n",
" batch_size: 16\n",
" shuffle: false\n",
" num_workers: 8\n",
" pin_memory: true\n",
" use_start_end_token: false\n",
" \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[NeMo I 2024-04-12 16:10:10 nemo_logging:381] PADDING: 0\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[NeMo W 2024-04-12 16:10:11 nemo_logging:393] c:\\Users\\spn\\anaconda3\\envs\\capstone\\Lib\\site-packages\\torch\\nn\\modules\\rnn.py:83: UserWarning: dropout option adds dropout after all but last recurrent layer, so non-zero dropout expects num_layers greater than 1, but got dropout=0.2 and num_layers=1\n",
" warnings.warn(\"dropout option adds dropout after all but last \"\n",
" \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[NeMo I 2024-04-12 16:10:11 nemo_logging:381] Using RNNT Loss : warprnnt_numba\n",
" Loss warprnnt_numba_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0}\n",
"[NeMo I 2024-04-12 16:10:12 nemo_logging:381] Model EncDecHybridRNNTCTCBPEModel was successfully restored from C:\\Users\\spn\\.cache\\huggingface\\hub\\models--nvidia--stt_es_fastconformer_hybrid_large_pc\\snapshots\\65f775445d5947d6784c3e80d9a14d859571947f\\stt_es_fastconformer_hybrid_large_pc.nemo.\n"
]
}
],
"source": [
"from models.es_fastconformer import stt_es_model\n",
"model = stt_es_model()\n",
"# check how much memory is used by the model\n",
"import torch\n",
"import psutil\n",
"import os\n",
"import time\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model size: 458.86 MB\n"
]
}
],
"source": [
"# get the size of the model in terms of memory in MB\n",
"def get_size(model):\n",
"    torch.save(model.state_dict(), 'temp.p')\n",
"    size = os.path.getsize('temp.p') / 1e6\n",
"    os.remove('temp.p')\n",
"    return size\n",
"size = get_size(model)\n",
"print(f\"Model size: {size:.2f} MB\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "capstone",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
main.py
ADDED
@@ -0,0 +1,79 @@
import streamlit as st
from streamlit_webrtc import webrtc_streamer, WebRtcMode, RTCConfiguration
import av
import numpy as np
import pydub
from io import BytesIO
from models.nllb import nllb
from models.parakeet import parakeet_ctc_model
from stream_VAD import stream
from models.es_fastconformer import stt_es_model

RTC_CONFIGURATION = RTCConfiguration({"iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]})

# Load models once
model_nllb, tokenizer_nllb = nllb()
parakeet = parakeet_ctc_model()
stt_model = stt_es_model()

def process_audio(audio_chunk, language):
    # Convert audio chunk to pydub.AudioSegment
    audio_segment = pydub.AudioSegment(
        data=audio_chunk.tobytes(),
        sample_width=audio_chunk.format.sample_width,
        frame_rate=audio_chunk.sample_rate,
        channels=len(audio_chunk.layout.channels)
    )

    # Process audio based on selected language
    if language == "en":
        processed_audio = stream(parakeet, model_nllb, tokenizer_nllb, "english", "spanish", audio_segment)
    elif language == "es":
        processed_audio = stream(stt_model, model_nllb, tokenizer_nllb, "spanish", "english", audio_segment)
    else:
        return audio_chunk

    # Convert processed audio back to numpy array
    processed_audio_np = np.array(processed_audio.get_array_of_samples())

    return processed_audio.frame_rate, processed_audio_np

def audio_callback(frame: av.AudioFrame, language):
    audio_data = frame.to_ndarray()
    audio_chunk = av.AudioFrame.from_ndarray(audio_data, format="s16", layout="mono")
    return process_audio(audio_chunk, language)

st.title("Real-Time Audio Processing")

language = st.radio("Select Language", ["en", "es"], index=0)

webrtc_ctx = webrtc_streamer(
    key="audio",
    mode=WebRtcMode.SENDRECV,
    rtc_configuration=RTC_CONFIGURATION,
    media_stream_constraints={"audio": True, "video": False},
    audio_receiver_size=256,
    async_processing=True,
)

if webrtc_ctx.audio_receiver:
    webrtc_ctx.audio_receiver.on("data", lambda frame: audio_callback(frame, language))

if "audio_buffer" not in st.session_state:
    st.session_state["audio_buffer"] = BytesIO()

if webrtc_ctx.audio_receiver:
    audio_frames = webrtc_ctx.audio_receiver.get_frames()

    for frame in audio_frames:
        processed_audio_rate, processed_audio_np = audio_callback(frame, language)

        audio_segment = pydub.AudioSegment(
            data=processed_audio_np.tobytes(),
            sample_width=processed_audio_np.dtype.itemsize,
            frame_rate=processed_audio_rate,
            channels=1
        )
        st.session_state["audio_buffer"].write(audio_segment.export(format="wav").read())

st.audio(st.session_state["audio_buffer"].getvalue(), format="audio/wav")
main_stream.ipynb
ADDED
@@ -0,0 +1,87 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-06-25 20:01:43,998] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[2024-06-25 20:01:44,318] torch.distributed.elastic.multiprocessing.redirects: [WARNING] NOTE: Redirects are currently not supported in Windows or MacOs.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loading model...\n",
"[2024-06-25 20:02:01,663] [INFO] [logging.py:96:log_dist] [Rank -1] DeepSpeed info: version=0.14.0+ce78a63, git-hash=ce78a63, git-branch=master\n",
"[2024-06-25 20:02:01,664] [WARNING] [config_utils.py:69:_process_deprecated_field] Config parameter replace_method is deprecated. This parameter is no longer needed, please remove from your call to DeepSpeed-inference\n",
"[2024-06-25 20:02:01,665] [WARNING] [config_utils.py:69:_process_deprecated_field] Config parameter mp_size is deprecated use tensor_parallel.tp_size instead\n",
"[2024-06-25 20:02:01,666] [INFO] [logging.py:96:log_dist] [Rank -1] quantize_bits = 8 mlp_extra_grouping = False, quantize_groups = 1\n",
"[2024-06-25 20:02:01,900] [INFO] [logging.py:96:log_dist] [Rank -1] DeepSpeed-Inference config: {'layer_id': 0, 'hidden_size': 1024, 'intermediate_size': 4096, 'heads': 16, 'num_hidden_layers': -1, 'dtype': torch.float32, 'pre_layer_norm': True, 'norm_type': <NormType.LayerNorm: 1>, 'local_rank': -1, 'stochastic_mode': False, 'epsilon': 1e-05, 'mp_size': 1, 'scale_attention': True, 'triangular_masking': True, 'local_attention': False, 'window_size': 1, 'rotary_dim': -1, 'rotate_half': False, 'rotate_every_two': True, 'return_tuple': True, 'mlp_after_attn': True, 'mlp_act_func_type': <ActivationFuncType.GELU: 1>, 'specialized_mode': False, 'training_mp_size': 1, 'bigscience_bloom': False, 'max_out_tokens': 1024, 'min_out_tokens': 1, 'scale_attn_by_inverse_layer_idx': False, 'enable_qkv_quantization': False, 'use_mup': False, 'return_single_tuple': False, 'set_empty_params': False, 'transposed_mode': False, 'use_triton': False, 'triton_autotune': False, 'num_kv': -1, 'rope_theta': 10000}\n"
]
}
],
"source": [
"from models.TTS_utils import load_manual_xtts_v2\n",
"config_path = \"test/config.json\"\n",
"model_path = \"test\"\n",
"\n",
"xtts_v2_model = load_manual_xtts_v2(config_path, model_path)\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Inference...\n",
"No more text to process\n",
"Inference...\n",
"No more text to process\n"
]
}
],
"source": [
"from models.TTS_utils import stream_prod\n",
"stream_prod(xtts_v2_model, \"record_temp.json\", \"audio_segments/\")\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "capstone",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
models/TTS_utils.py
ADDED
@@ -0,0 +1,365 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from TTS.api import TTS
|
3 |
+
import time
|
4 |
+
import torchaudio
|
5 |
+
from TTS.tts.configs.xtts_config import XttsConfig
|
6 |
+
from TTS.tts.models.xtts import Xtts
|
7 |
+
import sounddevice as sd
|
8 |
+
|
9 |
+
|
10 |
+
def xtts_v2():
|
11 |
+
"""
|
12 |
+
Load and return the XTTS v2 model.
|
13 |
+
|
14 |
+
This function initializes the XTTS v2 model from the 🐸TTS library.
|
15 |
+
The model is configured to use a GPU if available, otherwise it defaults to CPU.
|
16 |
+
|
17 |
+
Returns:
|
18 |
+
TTS: The initialized XTTS v2 model.
|
19 |
+
|
20 |
+
Example usage:
|
21 |
+
tts = xtts_v2()
|
22 |
+
"""
|
23 |
+
# Get device
|
24 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
25 |
+
|
26 |
+
# List available 🐸TTS models
|
27 |
+
# print(TTS().list_models())
|
28 |
+
|
29 |
+
# Init TTS
|
30 |
+
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
|
31 |
+
|
32 |
+
|
33 |
+
return tts
|
34 |
+
|
35 |
+
def load_manual_xtts_v2(config_path, checkpoint_path):
|
36 |
+
"""
|
37 |
+
Load the XTTS v2 model manually with configuration and checkpoint files.
|
38 |
+
|
39 |
+
Args:
|
40 |
+
config_path (str): Path to the configuration file.
|
41 |
+
Example: "path/to/config.json"
|
42 |
+
checkpoint_path (str): Path to the checkpoint directory.
|
43 |
+
Example: "path/to/checkpoint/"
|
44 |
+
|
45 |
+
Returns:
|
46 |
+
Xtts: The loaded XTTS v2 model.
|
47 |
+
|
48 |
+
Example usage:
|
49 |
+
model = load_manual_xtts_v2("config.json", "checkpoint/")
|
50 |
+
"""
|
51 |
+
print("Loading model...")
|
52 |
+
config = XttsConfig()
|
53 |
+
config.load_json(config_path)
|
54 |
+
model = Xtts.init_from_config(config)
|
55 |
+
model.load_checkpoint(config, checkpoint_dir=checkpoint_path, use_deepspeed=True)
|
56 |
+
model.cuda()
|
57 |
+
|
58 |
+
return model
|
59 |
+
|
60 |
+
import json
|
61 |
+
import concurrent.futures
|
62 |
+
|
63 |
+
# ----------------- StreamXTTSV2 -----------------
|
64 |
+
def get_text_order(json_path, num_elements, ):
|
65 |
+
"""
|
66 |
+
Retrieve a specified number of text elements from a JSON file and update the file.
|
67 |
+
|
68 |
+
Args:
|
69 |
+
json_path (str): Path to the JSON file.
|
70 |
+
Example: "path/to/data.json"
|
71 |
+
num_elements (int): Number of elements to retrieve.
|
72 |
+
Example: 3
|
73 |
+
|
74 |
+
Returns:
|
75 |
+
list: A list of tuples containing text, order, original_path, path_to_save, and language.
|
76 |
+
|
77 |
+
Example usage:
|
78 |
+
text_order = get_text_order("data.json", 3)
|
79 |
+
"""
|
80 |
+
with open(json_path) as f:
|
81 |
+
data = json.load(f)
|
82 |
+
# check if the data is empty
|
83 |
+
if not data['text']:
|
84 |
+
return "No more text to process"
|
85 |
+
if len(data['text']) < num_elements:
|
86 |
+
num_elements = len(data['text'])
|
87 |
+
text = data['text'][:num_elements]
|
88 |
+
order = data['order'][:num_elements]
|
89 |
+
original_path = data['original_path'][:num_elements]
|
90 |
+
path_to_save = data['path_to_save'][:num_elements]
|
91 |
+
language = data['language'][:num_elements]
|
92 |
+
# remove the first elements
|
93 |
+
data['text'] = data['text'][num_elements:]
|
94 |
+
data['order'] = data['order'][num_elements:]
|
95 |
+
data['original_path'] = data['original_path'][num_elements:]
|
96 |
+
data['path_to_save'] = data['path_to_save'][num_elements:]
|
97 |
+
data['language'] = data['language'][num_elements:]
|
98 |
+
data['original_text'] = data['original_text'][num_elements:]
|
99 |
+
# write the data back to the file
|
100 |
+
with open(json_path, 'w') as f:
|
101 |
+
json.dump(data, f)
|
102 |
+
# make it return an array of arrays of text and order
|
103 |
+
result = [i for i in zip(text, order, original_path, path_to_save, language)]
|
104 |
+
return result
|
105 |
+
|
106 |
+
def append_text_order(json_path, text, order, original_path, path_to_save, language, original_text=None):
|
107 |
+
"""
|
108 |
+
Append a text order to a JSON file.
|
109 |
+
|
110 |
+
Args:
|
111 |
+
json_path (str): Path to the JSON file.
|
112 |
+
Example: "path/to/data.json"
|
113 |
+
text (str): The text to append.
|
114 |
+
Example: "Hello, world!"
|
115 |
+
order (int): The order index.
|
116 |
+
Example: 1
|
117 |
+
original_path (str): Path to the original file.
|
118 |
+
Example: "path/to/original.wav"
|
119 |
+
path_to_save (str): Path to save the processed file.
|
120 |
+
Example: "path/to/save.wav"
|
121 |
+
language (str): Language of the text.
|
122 |
+
Example: "en"
|
123 |
+
original_text (str, optional): The original text if available.
|
124 |
+
Example: "Hola, mundo!"
|
125 |
+
|
126 |
+
Example usage:
|
127 |
+
append_text_order("data.json", "Hello, world!", 1, "original.wav", "save.wav", "en", "Hola, mundo!")
|
128 |
+
"""
|
129 |
+
with open(json_path) as f:
|
130 |
+
data = json.load(f)
|
131 |
+
data['text'].append(text)
|
132 |
+
data['order'].append(order)
|
133 |
+
data['original_path'].append(original_path)
|
134 |
+
data['path_to_save'].append(path_to_save)
|
135 |
+
data['language'].append(language)
|
136 |
+
data['original_text'].append(original_text)
|
137 |
+
with open(json_path, 'w') as f:
|
138 |
+
json.dump(data, f)
|
139 |
+
# ----------------- StreamXTTSV2 -----------------
|
140 |
+
class StreamXTTSV2:
|
141 |
+
"""
|
142 |
+
A class to handle streaming TTS using XTTS v2 model.
|
143 |
+
|
144 |
+
Args:
|
145 |
+
model (Xtts): The XTTS v2 model.
|
146 |
+
sample_rate (int, optional): The sample rate for audio playback. Default is 24000.
|
147 |
+
buffer_size (int, optional): The buffer size for audio playback. Default is 2.
|
148 |
+
"""
|
149 |
+
def __init__(self, model, sample_rate=24000, buffer_size=2):
|
150 |
+
self.model = model
|
151 |
+
#self.gpt_cond_latent = gpt_cond_latent
|
152 |
+
#self.speaker_embedding = speaker_embedding
|
153 |
+
self.sample_rate = sample_rate
|
154 |
+
self.buffer_size = buffer_size
|
155 |
+
self.speed = 0.95
|
156 |
+
self.stream_chunk_size = 40
|
157 |
+
self.buffer = torch.Tensor().to('cpu')
|
158 |
+
self.chunk_save = torch.Tensor().to('cpu')
|
159 |
+
self.is_playing = False
|
160 |
+
self.tasks_order = []
|
161 |
+
self.order = 0
|
162 |
+
self.initial = True
|
163 |
+
|
164 |
+
def chunk_callback(self, chunk, i, output_dir, order):
|
165 |
+
"""
|
166 |
+
Callback function to handle each chunk of audio during streaming.
|
167 |
+
|
168 |
+
Args:
|
169 |
+
chunk (torch.Tensor): The audio chunk.
|
170 |
+
Example: tensor([0.1, 0.2, 0.3])
|
171 |
+
i (int): The chunk index.
|
172 |
+
Example: 1
|
173 |
+
output_dir (str): Directory to save the chunk.
|
174 |
+
Example: "output/"
|
175 |
+
order (int): The order index.
|
176 |
+
Example: 1
|
177 |
+
"""
|
178 |
+
# Accumulate chunk into buffer
|
179 |
+
self.buffer = torch.cat((self.buffer, chunk.squeeze().to('cpu')), dim=-1)
|
180 |
+
self.chunk_save = torch.cat((self.chunk_save, chunk.squeeze().to('cpu')), dim=-1)
|
181 |
+
chunk_filename = output_dir + f"chunk_{i}_{order}.wav"
|
182 |
+
print(self.sample_rate)
|
183 |
+
torchaudio.save(chunk_filename, self.chunk_save.unsqueeze(0), self.sample_rate)
|
184 |
+
print(f"Chunk saved as {chunk_filename}")
|
185 |
+
self.chunk_save = torch.Tensor().to('cpu')
|
186 |
+
|
187 |
+
# Check if buffer has enough chunks to start playing
|
188 |
+
if not self.is_playing and len(self.buffer) >= self.buffer_size:
|
189 |
+
self.start_playback()
|
190 |
+
|
191 |
+
def start_playback(self):
|
192 |
+
"""Start audio playback."""
|
193 |
+
self.is_playing = True
|
194 |
+
sd.play(self.buffer.numpy(), self.sample_rate, blocking=False)
|
195 |
+
self.buffer = torch.Tensor().to('cpu') # Reset buffer after starting playback
|
196 |
+
|
197 |
+
def play(self, chunks, output_dir, path_to_save, order):
|
198 |
+
"""
|
199 |
+
Play the audio chunks and save the complete audio.
|
200 |
+
|
201 |
+
Args:
|
202 |
+
chunks (list): List of audio chunks.
|
203 |
+
Example: [tensor([0.1, 0.2, 0.3]), tensor([0.4, 0.5, 0.6])]
|
204 |
+
output_dir (str): Directory to save the chunks.
|
205 |
+
Example: "output/"
|
206 |
+
path_to_save (str): Path to save the complete audio file.
|
207 |
+
Example: "output/complete.wav"
|
208 |
+
order (int): The order index.
|
209 |
+
Example: 1
|
210 |
+
"""
|
211 |
+
t0 = time.time()
|
212 |
+
|
213 |
+
|
        for i, chunk in enumerate(chunks):
            #print(chunk)
            if i == 0:
                print(f"Time to first chunk: {time.time() - t0}")
            print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
            self.chunk_callback(chunk, i, output_dir, order)

        # Ensure all remaining audio is played
        while sd.get_stream().active:
            time.sleep(0.1)
        if len(self.buffer) > 0:
            sd.play(self.buffer.numpy(), self.sample_rate, blocking=True)

        # Save the complete audio to a file
        torchaudio.save(path_to_save, self.buffer.unsqueeze(0), self.sample_rate)
        print(f"Total audio length: {self.buffer.shape[-1]}")
        print("Audio playback finished.")
        #self.order += 1


    def inference_and_play(self, json_path, output_dir):
        """
        Perform inference and play the generated audio.

        Args:
            json_path (str): Path to the JSON file containing text orders.
                Example: "path/to/data.json"
            output_dir (str): Directory to save the chunks.
                Example: "output/"
        """
        print("Inference...")

        # Fetch the next batch of text orders (three at a time)
        self.texts = get_text_order(json_path, 3)

        if self.texts == "No more text to process":
            print("No more text to process")
            return
        if self.texts == "Not enough text to process":
            print("Not enough text to process")
            return
        # get_text_order returns a list of (text, order, audio_path, save_path, lang) entries
        if self.texts is not None:
            #print(self.texts)
            self.gpt_cond_latent, self.speaker_embedding = self.model.get_conditioning_latents(audio_path=[self.texts[0][2]])
            path_to_save = self.texts[0][3]
            #print(self.gpt_cond_latent, self.speaker_embedding)

            with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
                futures = []
                print(self.texts)

                for text, i, path_a, path_s, lang in self.texts:
                    print(f"Processing text {i}: {text}")
                    print(f"Processing text {i}: {lang}")
                    future = executor.submit(self.model.inference_stream, text, lang, self.gpt_cond_latent, self.speaker_embedding, stream_chunk_size=self.stream_chunk_size, speed=self.speed)
                    futures.append(future)

                for future, text in zip(futures, self.texts):
                    chunks = future.result()
                    print(text[1])
                    self.play(chunks, output_dir, path_to_save, text[1])
                    self.buffer = torch.Tensor().to('cpu')

        # Keep polling the JSON file for newly appended text orders
        self.inference_and_play(json_path, output_dir)


def stream_prod(model, json_path, directory_path):
    """
    Stream production function for XTTS v2.

    Args:
        model (Xtts): The XTTS v2 model.
            Example: model = load_manual_xtts_v2("config.json", "checkpoint/")
        json_path (str): Path to the JSON file containing text orders.
            Example: "path/to/data.json"
        directory_path (str): Directory to save the chunks.
            Example: "output/"
    """
    streamer = StreamXTTSV2(model, buffer_size=2)
    results = streamer.inference_and_play(json_path, directory_path)
    if results is None:
        # inference_and_play returns None when it runs out of work; pause, then poll again
        time.sleep(3)
        stream_prod(model, json_path, directory_path)
    return "Streaming finished"


def just_inference(model, original_path, output_dir, text, lang, order):
    """
    Perform inference and save the generated audio.

    Args:
        model (Xtts): The XTTS v2 model.
            Example: model = load_manual_xtts_v2("config.json", "checkpoint/")
        original_path (str): Path to the original audio file.
            Example: "path/to/original.wav"
        output_dir (str): Directory to save the generated audio file.
            Example: "output/"
        text (str): The text to be synthesized.
            Example: "Hello, world!"
        lang (str): The language of the text.
            Example: "en"
        order (int): The order index.
            Example: 1

    Returns:
        tuple: A tuple containing the path to the saved audio file and the time to first chunk.
            Example: ("output/complete.wav", 1.23)
    """
    print("Inference...")
    path_to_save = output_dir
    t0 = time.time()
    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=[original_path])

    chunks = model.inference_stream(
        text,
        lang,
        gpt_cond_latent,
        speaker_embedding,
        stream_chunk_size=15,
        speed=0.95
        #temperature=0.1,
        #enable_text_splitting=True,
    )
    full_audio = torch.Tensor().to('cpu')
    wav_chunks = []
    time_to_first_chunk = None  # set once the first chunk arrives
    for i, chunk in enumerate(chunks):
        if i == 0:  # measure latency to the first generated chunk
            time_to_first_chunk = time.time() - t0
            print(f"Time to first chunk: {time_to_first_chunk}")
        print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
        wav_chunks.append(chunk)
        full_audio = torch.cat((full_audio, chunk.squeeze().to('cpu')), dim=-1)

    # Save the complete audio to a file
    torchaudio.save(path_to_save, full_audio.unsqueeze(0), 24000)

    print("Inference finished")
    return path_to_save, time_to_first_chunk

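For a quick sanity check, just_inference can be driven end to end along these lines (a sketch only; the config, checkpoint, and speaker WAV paths are placeholders):

    # Hypothetical smoke test for just_inference; all paths are placeholders.
    from models.TTS_utils import load_manual_xtts_v2, just_inference

    model = load_manual_xtts_v2("config.json", "checkpoint/")
    path, ttfc = just_inference(
        model,
        original_path="speaker.wav",        # reference voice for conditioning
        output_dir="results/complete.wav",  # note: used as the output *file* path
        text="Hello, world!",
        lang="en",
        order=0,
    )
    print(path, ttfc)
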
models/__init__.py
ADDED
File without changes

models/__pycache__/TTS_utils.cpython-311.pyc
ADDED
Binary file (18.2 kB)

models/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (173 Bytes)

models/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (155 Bytes)

models/__pycache__/es_fastconformer.cpython-311.pyc
ADDED
Binary file (1.98 kB)

models/__pycache__/nllb.cpython-311.pyc
ADDED
Binary file (3.84 kB)

models/__pycache__/nllb.cpython-38.pyc
ADDED
Binary file (2.49 kB)

models/__pycache__/noise_red.cpython-311.pyc
ADDED
Binary file (1.3 kB)

models/__pycache__/parakeet.cpython-311.pyc
ADDED
Binary file (2.08 kB)

models/__pycache__/parakeet.cpython-38.pyc
ADDED
Binary file (1.69 kB)
models/es_fastconformer.py
ADDED
@@ -0,0 +1,37 @@
import nemo.collections.asr as nemo_asr
import torch

def stt_es_model():
    """
    Load and return the pre-trained Spanish ASR model.

    This function loads the pre-trained EncDecCTCModelBPE model from NVIDIA's NeMo collection.
    The model is configured to use a GPU if available; otherwise it defaults to CPU.

    Returns:
        nemo_asr.models.EncDecCTCModelBPE: The loaded ASR model.

    Example usage:
        asr_model = stt_es_model()
    """
    # Load the pre-trained model
    asr_model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained("nvidia/stt_es_fastconformer_hybrid_large_pc")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    asr_model = asr_model.to(device)
    return asr_model

def stt_es_process(asr_model, audio_file):
    """
    Transcribe an audio file using the given ASR model.

    Args:
        asr_model (nemo_asr.models.EncDecCTCModelBPE): The ASR model to use for transcription.
            Example: asr_model = stt_es_model()
        audio_file (str): Path to the audio file to be transcribed.
            Example: "path/to/audio_file.wav"

    Returns:
        list: A list containing the transcribed text.
            Example: ["transcribed text"]
    """
    text = asr_model.transcribe(paths2audio_files=[audio_file], batch_size=1)
    return text

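A minimal sketch of the Spanish ASR wrapper in use (the WAV path is a placeholder; the stream below records 16 kHz mono WAVs, which is what this model consumes here):

    # Placeholder path; transcribe() returns a list with one entry per input file.
    from models.es_fastconformer import stt_es_model, stt_es_process

    asr_model = stt_es_model()
    text = stt_es_process(asr_model, "audio_segments/segment_0.wav")
    print(text[0])
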
models/nllb.py
ADDED
@@ -0,0 +1,72 @@
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch

def nllb():
    """
    Load and return the NLLB (No Language Left Behind) model and tokenizer.

    This function loads the NLLB-200-distilled-1.3B model and tokenizer from Hugging Face's Transformers library.
    The model is currently pinned to CPU; the CUDA device selection is left commented out.

    Returns:
        tuple: A tuple containing the loaded model and tokenizer.
            - model (transformers.AutoModelForSeq2SeqLM): The loaded NLLB model.
            - tokenizer (transformers.AutoTokenizer): The loaded tokenizer.

    Example usage:
        model, tokenizer = nllb()
    """
    #device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device = torch.device("cpu")
    # Load the tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-1.3B")
    model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-1.3B").to(device)
    # Signal readiness by writing "done" to status.txt
    with open("status.txt", 'w') as f:
        f.write("done")
    return model, tokenizer

def nllb_translate(model, tokenizer, article, language):
    """
    Translate an article using the NLLB model and tokenizer.

    Args:
        model (transformers.AutoModelForSeq2SeqLM): The NLLB model to use for translation.
            Example: model, tokenizer = nllb()
        tokenizer (transformers.AutoTokenizer): The tokenizer to use with the NLLB model.
            Example: model, tokenizer = nllb()
        article (str): The article text to be translated.
            Example: "This is a sample article."
        language (str): The target language for translation. Must be either 'es' or 'en'.
            Example: "es"

    Returns:
        str: The translated text.
            Example: "Este es un artículo de muestra."
    """
    try:
        # Tokenize the text
        inputs = tokenizer(article, return_tensors="pt")

        # Move the tokenized inputs to the same device as the model
        inputs = {k: v.to(model.device) for k, v in inputs.items()}

        if language == "es":
            translated_tokens = model.generate(
                **inputs, forced_bos_token_id=tokenizer.lang_code_to_id["spa_Latn"], max_length=30
            )
        elif language == "en":
            translated_tokens = model.generate(
                **inputs, forced_bos_token_id=tokenizer.lang_code_to_id["eng_Latn"], max_length=30
            )
        else:
            raise ValueError("Unsupported language. Use 'es' or 'en'.")

        # Decode the translation
        text = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
        return text

    except Exception as e:
        print(f"Error during translation: {e}")
        return "Translation failed"

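A quick way to exercise the translation helper (the sentence is illustrative; note that max_length=30 will truncate long outputs):

    # Illustrative only: English -> Spanish through nllb_translate.
    from models.nllb import nllb, nllb_translate

    model, tokenizer = nllb()  # loads on CPU and writes "done" to status.txt
    print(nllb_translate(model, tokenizer, "hello, how are you?", "es"))
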
models/noise_red.py
ADDED
@@ -0,0 +1,28 @@
from scipy.io import wavfile
import noisereduce as nr

def noise_reduction(path, new_path):
    """
    Perform noise reduction on an audio file and save the output.

    This function reads an audio file from the given path, performs noise reduction using the noisereduce library,
    and saves the processed audio to a new file.

    Args:
        path (str): Path to the input audio file.
            Example: "path/to/input_audio.wav"
        new_path (str): Path to save the processed audio file.
            Example: "path/to/output_audio.wav"

    Returns:
        None

    Example usage:
        noise_reduction("input.wav", "output.wav")
    """
    rate, data = wavfile.read(path)
    # Perform noise reduction
    reduced_noise = nr.reduce_noise(y=data, sr=rate)
    wavfile.write(new_path, rate, reduced_noise)
    print("Noise reduction done!")

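In stream_VAD.py this helper is called with the same path for input and output, denoising each captured segment in place:

    # In-place denoise of a captured segment (the path is illustrative).
    from models.noise_red import noise_reduction

    noise_reduction("audio_segments/segment_0.wav", "audio_segments/segment_0.wav")
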
models/parakeet.py
ADDED
@@ -0,0 +1,43 @@
import nemo.collections.asr as nemo_asr
import torch


def parakeet_ctc_model():
    """
    Load and return the pre-trained Parakeet CTC model.

    This function loads the pre-trained EncDecCTCModelBPE model from NVIDIA's NeMo collection.
    The model is configured to use a GPU if available; otherwise it defaults to CPU.

    Returns:
        nemo_asr.models.EncDecCTCModelBPE: The loaded ASR model.

    Example usage:
        asr_model = parakeet_ctc_model()
    """
    # Load the pre-trained model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    asr_model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained("nvidia/parakeet-ctc-0.6b")
    asr_model = asr_model.to(device)
    return asr_model

def parakeet_ctc_process(asr_model, audio_file):
    """
    Transcribe an audio file using the given Parakeet CTC ASR model.

    Args:
        asr_model (nemo_asr.models.EncDecCTCModelBPE): The ASR model to use for transcription.
            Example: asr_model = parakeet_ctc_model()
        audio_file (str): Path to the audio file to be transcribed.
            Example: "path/to/audio_file.wav"

    Returns:
        list: A list containing the transcribed text.
            Example: ["transcribed text"]

    Example usage:
        text = parakeet_ctc_process(asr_model, "path/to/audio_file.wav")
    """
    text = asr_model.transcribe(paths2audio_files=[audio_file], batch_size=1)

    return text

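The English wrapper mirrors the Spanish one; a minimal sketch (placeholder path):

    # Placeholder path; transcribe() returns a list with one entry per input file.
    from models.parakeet import parakeet_ctc_model, parakeet_ctc_process

    asr_model = parakeet_ctc_model()
    print(parakeet_ctc_process(asr_model, "audio_segments/segment_0.wav")[0])
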
models/status.txt
ADDED
@@ -0,0 +1 @@
done

record_per.json
ADDED
@@ -0,0 +1 @@
{"text": ["Hola, \u00bf c\u00f3mo est\u00e1s?", "Est\u00e1 bien, intent\u00e9moslo de nuevo.", "As\u00ed que este modelo debe capturar esto y traducir inmediatamente", "As\u00ed que este modelo debe capturar esto y traducir inmediatamente", "Hola, \u00bf qu\u00e9 pasa?", "Hola, \u00bf qu\u00e9 pasa?", "Est\u00e1 bien, est\u00e1 bien, est\u00e1 bien, est\u00e1 bien, est\u00e1 bien, est\u00e1 bien, est\u00e1 bien, est\u00e1 bien."], "original_path": ["audio-transcription/uploads/5\\audio.wav", "audio-transcription/uploads/1\\audio.wav", "audio-transcription/uploads/2\\audio.wav", "audio-transcription/uploads/3\\audio.wav", "audio-transcription/uploads/4\\audio.wav", "audio-transcription/uploads/5\\audio.wav", "audio-transcription/uploads/6\\audio.wav"], "order": [0, 0, 0, 0, 0, 0, 0], "path_to_save": ["results", "results", "results", "results", "results/", "results/", "results/"], "language": ["es", "es", "es", "es", "es", "es", "es"], "original_text": ["hello how are you", "okay let's try it again", " so this model should capture this and translate right away", " so this model should capture this and translate right away", "hello", "hello", "okay okay okay okay okay okay okay okay okay okay okay okay okay okay"]}

record_temp.json
ADDED
@@ -0,0 +1 @@
{"text": ["Est\u00e1 bien, intent\u00e9moslo de nuevo.", "As\u00ed que este modelo debe capturar esto y traducir inmediatamente", "As\u00ed que este modelo debe capturar esto y traducir inmediatamente", "Hola, \u00bf qu\u00e9 pasa?", "Hola, \u00bf qu\u00e9 pasa?", "Est\u00e1 bien, est\u00e1 bien, est\u00e1 bien, est\u00e1 bien, est\u00e1 bien, est\u00e1 bien, est\u00e1 bien, est\u00e1 bien."], "original_path": ["audio-transcription/uploads/1\\audio.wav", "audio-transcription/uploads/2\\audio.wav", "audio-transcription/uploads/3\\audio.wav", "audio-transcription/uploads/4\\audio.wav", "audio-transcription/uploads/5\\audio.wav", "audio-transcription/uploads/6\\audio.wav"], "order": [0, 0, 0, 0, 0, 0], "path_to_save": ["results/", "results/", "results/", "results/", "results/", "results/"], "language": ["es", "es", "es", "es", "es", "es"], "original_text": ["okay let's try it again", " so this model should capture this and translate right away", " so this model should capture this and translate right away", "hello", "hello", "okay okay okay okay okay okay okay okay okay okay okay okay okay okay"]}

requirements.txt
ADDED
@@ -0,0 +1,25 @@
language_tool_python
noisereduce
numpy
pandas
pydub
#scikit_learn==1.4.0
scipy
speechbrain
webrtcvad==2.0.10
deepspeed==0.14.0
transformers==4.40.2
hydra-core
pytorch_lightning
streamlit
sounddevice
playsound
streamlit-webrtc
pybind11
fasttext
Cython
# nemo_toolkit[all]==1.21
fastapi
uvicorn
pydantic==1.10.9
spacy

results/readme
ADDED
File without changes

run.py
ADDED
@@ -0,0 +1,73 @@
import threading
import argparse
import subprocess
from models.nllb import nllb
from models.parakeet import parakeet_ctc_model
from models.es_fastconformer import stt_es_model
from models.TTS_utils import load_manual_xtts_v2
from stream_VAD import stream

def main(xtts_path, xtts_config_path, language="en", record_temp="record_temp.json", record_per="record_per.json", record_path="audio_segments/", result_dir="results", segments_dir="audio_segments"):
    """
    Main function to run the ASR stream and initiate the TTS stream production.

    Args:
        xtts_path (str): Path to the xtts model file.
            Example: "path/to/xtts_model.pt"
        xtts_config_path (str): Path to the xtts configuration file.
            Example: "path/to/xtts_config.json"
        language (str, optional): Language for the ASR model. Must be either 'en' for English or 'es' for Spanish.
            Default: 'en'
            Example: "en"
        record_temp (str, optional): Path to the temporary record JSON file.
            Default: "record_temp.json"
            Example: "path/to/record_temp.json"
        record_per (str, optional): Path to the periodic record JSON file.
            Default: "record_per.json"
            Example: "path/to/record_per.json"
        record_path (str, optional): Path to the directory where audio segments are recorded.
            Default: "audio_segments/"
            Example: "path/to/audio_segments/"
        result_dir (str, optional): Path to the directory where results are stored.
            Default: "results"
            Example: "path/to/results"
        segments_dir (str, optional): Path to the directory where audio segments are stored.
            Default: "audio_segments"
            Example: "path/to/audio_segments"
    """
    model_nllb, tokenizer_nllb = nllb()

    if language == "en":
        asr = parakeet_ctc_model()
        stream_thread = threading.Thread(target=stream, args=(asr, model_nllb, tokenizer_nllb, "english", "spanish", record_temp, record_per, result_dir, segments_dir))

    elif language == "es":
        asr = stt_es_model()
        stream_thread = threading.Thread(target=stream, args=(asr, model_nllb, tokenizer_nllb, "spanish", "english", record_temp, record_per, result_dir, segments_dir))

    else:
        raise ValueError("Language not supported")

    # Start the stream thread
    stream_thread.start()

    # Call the other script to start stream_prod
    subprocess.Popen(['python', 'stream_prod_main.py', xtts_path, xtts_config_path, record_temp, record_path])

    # Wait for the stream thread to complete
    stream_thread.join()

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run stream and initiate stream_prod.")
    parser.add_argument("xtts_path", type=str, help="Path to the xtts model.")
    parser.add_argument("xtts_config_path", type=str, help="Path to the xtts config.")
    parser.add_argument("language", type=str, choices=["en", "es"], help="Language (en or es).")
    parser.add_argument("--record_temp", type=str, default="record_temp.json", help="Path to the record temp file.")
    parser.add_argument("--record_per", type=str, default="record_per.json", help="Path to the record per file.")
    parser.add_argument("--record_path", type=str, default="audio_segments/", help="Path to the record directory.")
    parser.add_argument("--result_dir", type=str, default="results", help="Path to the result directory.")
    parser.add_argument("--segments_dir", type=str, default="audio_segments", help="Path to the segments directory.")

    args = parser.parse_args()

    main(args.xtts_path, args.xtts_config_path, args.language, args.record_temp, args.record_per, args.record_path, args.result_dir, args.segments_dir)

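A typical invocation, following the argparse definitions above (the checkpoint and config paths are placeholders):

    python run.py checkpoint/model.pth config.json en --result_dir results --segments_dir audio_segments
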
setup.sh
ADDED
@@ -0,0 +1,18 @@
#!/bin/bash
git clone https://github.com/coqui-ai/TTS/ && \
cd TTS && \
make install

pip install PyAudio-0.2.11-cp37-cp37m-win_amd64.whl
pip install pybind11
pip install wheel setuptools pip --upgrade
pip install fasttext
apt-get update && apt-get install -y libsndfile1 ffmpeg
pip install Cython
# pip install nemo_toolkit['all']

# show the version of nemo in python
python -c "import nemo; print(nemo.__version__)"
pip install torch==2.2.2+cu121 -f https://download.pytorch.org/whl/cu121/torch_stable.html
pip install torchaudio==2.2.2+cu121 -f https://download.pytorch.org/whl/cu121/torch_stable.html
pip install -r requirements.txt

status.txt
ADDED
@@ -0,0 +1 @@
done

stream_VAD.py
ADDED
@@ -0,0 +1,249 @@
import collections
import contextlib
import wave
import webrtcvad
import pyaudio
import os
import librosa
import numpy as np
from models.nllb import nllb_translate
from models.TTS_utils import append_text_order
from models.parakeet import parakeet_ctc_process
from models.es_fastconformer import stt_es_process
from concurrent.futures import ThreadPoolExecutor
import time
from models.noise_red import noise_reduction

class Frame(object):
    """
    Represents a "frame" of audio data.

    Args:
        bytes (bytes): The audio data.
        timestamp (float): The timestamp of the frame.
        duration (float): The duration of the frame.
    """
    def __init__(self, bytes, timestamp, duration):
        self.bytes = bytes
        self.timestamp = timestamp
        self.duration = duration

def read_audio(stream, frame_duration_ms, rate):
    """
    Generates audio frames from the input stream.

    Args:
        stream (pyaudio.Stream): The audio stream.
        frame_duration_ms (int): Duration of each frame in milliseconds.
        rate (int): The sample rate of the audio.

    Yields:
        bytes: The audio frames.
    """
    frames_per_buffer = int(rate * frame_duration_ms / 1000)
    while True:
        yield stream.read(frames_per_buffer)

def vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad, frames):
    """
    Filters out non-voiced audio frames.

    Args:
        sample_rate (int): The sample rate of the audio.
        frame_duration_ms (int): Duration of each frame in milliseconds.
        padding_duration_ms (int): Duration of padding in milliseconds.
        vad (webrtcvad.Vad): The VAD object.
        frames (generator): A generator yielding audio frames.

    Yields:
        bytes: Voiced audio segments (the concatenated bytes of consecutive voiced frames).
    """
    num_padding_frames = int(padding_duration_ms / frame_duration_ms)
    ring_buffer = collections.deque(maxlen=num_padding_frames)
    triggered = False

    voiced_frames = []
    for frame in frames:
        is_speech = vad.is_speech(frame.bytes, sample_rate)

        if not triggered:
            ring_buffer.append((frame, is_speech))
            num_voiced = len([f for f, speech in ring_buffer if speech])
            # Trigger when 90% of the buffered frames are voiced
            if num_voiced > 0.9 * ring_buffer.maxlen:
                triggered = True
                voiced_frames.extend(f for f, speech in ring_buffer)
                ring_buffer.clear()
        else:
            voiced_frames.append(frame)
            ring_buffer.append((frame, is_speech))
            num_unvoiced = len([f for f, speech in ring_buffer if not speech])
            # Close the segment when 90% of the buffered frames are unvoiced
            if num_unvoiced > 0.9 * ring_buffer.maxlen:
                yield b''.join([f.bytes for f in voiced_frames])
                ring_buffer.clear()
                voiced_frames = []
                triggered = False
    if voiced_frames:
        yield b''.join([f.bytes for f in voiced_frames])


def is_segment_empty(file_path):
    """
    Check if the audio segment is empty.

    Args:
        file_path (str): Path to the audio file.

    Returns:
        bool: True if the segment is empty, False otherwise.
    """
    audio, _ = librosa.load(file_path)
    rms = librosa.feature.rms(y=audio)  # Pass the audio data as an argument
    rms_mean = np.mean(rms)
    print(rms_mean)

    if rms_mean < 0.015:
        return True
    else:
        return False


def process_segment(asr_model, model_nllb, tokenizer_nllb, path_segments, path_results, target_lang, order, json_path_temp, json_path_record):
    """
    Process an audio segment: noise reduction, transcription, translation, and append results.

    Args:
        asr_model: The ASR model for transcription.
        model_nllb: The NLLB model for translation.
        tokenizer_nllb: The tokenizer for the NLLB model.
        path_segments (str): Path to the audio segment.
        path_results (str): Path to save the results.
        target_lang (str): Target language for translation.
        order (int): Order index of the segment.
        json_path_temp (str): Path to the temporary JSON file.
        json_path_record (str): Path to the record JSON file.
    """
    print("Processing segment...")
    if is_segment_empty(path_segments):
        print("No speech detected.")
        # Remove the empty segment
        os.remove(path_segments)
        return
    # Noise reduction (in place)
    start_time = time.time()
    noise_reduction(path_segments, path_segments)
    print("Noise removed. Time:", time.time() - start_time)

    # Transcription
    transcription = transcribe(asr_model, path_segments, target_lang)
    #if not transcription.strip():
    #    print("No speech detected.")
    #    return

    # Translation
    print("Translating...")
    translation = translate(model_nllb, tokenizer_nllb, transcription, target_lang)

    # Text-to-speech is handled downstream; record the text orders instead
    # process_tts(tts_model, translation, path_segments, target_lang, path_results)
    append_text_order(json_path_temp, translation, order, path_segments, path_results, "es" if target_lang == "spanish" else "en", transcription)
    append_text_order(json_path_record, translation, order, path_segments, path_results, "es" if target_lang == "spanish" else "en", transcription)

def transcribe(asr_model, path_segments, target_lang):
    """
    Transcribe an audio segment using the specified ASR model.

    Args:
        asr_model: The ASR model for transcription.
        path_segments (str): Path to the audio segment.
        target_lang (str): Target language of the translation step; used to pick the ASR
            model that matches the *source* language.

    Returns:
        str: The transcription of the audio segment.
    """
    start_time = time.time()
    # target_lang "spanish" means the source audio is English (Parakeet), and vice versa
    transcription_func = {
        "spanish": parakeet_ctc_process,
        "english": stt_es_process
    }[target_lang]
    transcription = transcription_func(asr_model, path_segments)
    print("Transcription:", transcription[0])
    print("Transcription time:", time.time() - start_time)
    return transcription[0]

def translate(model_nllb, tokenizer_nllb, text, target_lang):
    """
    Translate text using the specified NLLB model and tokenizer.

    Args:
        model_nllb: The NLLB model for translation.
        tokenizer_nllb: The tokenizer for the NLLB model.
        text (str): The text to translate.
        target_lang (str): Target language for translation.

    Returns:
        str: The translated text.
    """
    print("Processing translation...")
    start_time = time.time()
    translation = nllb_translate(model_nllb, tokenizer_nllb, text, target_lang)
    print("Translation:", translation)
    print("Translation time:", time.time() - start_time)
    return translation


def stream(asr_model, model_nllb, tokenizer_nllb, source_lang, target_lang, json_file_temp, json_file_record, result_dir="results", segments_dir="audio_segments"):
    """
    Stream audio input, process segments, and save the results.

    Args:
        asr_model: The ASR model for transcription.
        model_nllb: The NLLB model for translation.
        tokenizer_nllb: The tokenizer for the NLLB model.
        source_lang (str): Source language of the audio.
        target_lang (str): Target language for translation.
        json_file_temp (str): Path to the temporary JSON file.
        json_file_record (str): Path to the record JSON file.
        result_dir (str, optional): Directory to save the results. Default is "results".
        segments_dir (str, optional): Directory to save the audio segments. Default is "audio_segments".
    """
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 16000
    CHUNK_DURATION_MS = 30  # supports 10, 20 and 30 (ms)
    PADDING_DURATION_MS = 300
    vad = webrtcvad.Vad(1)

    audio = pyaudio.PyAudio()
    stream = audio.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=160)
    frames = read_audio(stream, CHUNK_DURATION_MS, RATE)
    frames = (Frame(f, None, None) for f in frames)

    if not os.path.exists(segments_dir):
        os.makedirs(segments_dir)
    if not os.path.exists(result_dir):
        os.makedirs(result_dir)

    executor = ThreadPoolExecutor(max_workers=2)  # Adjust the number of workers as needed

    for i, segment in enumerate(vad_collector(RATE, CHUNK_DURATION_MS, PADDING_DURATION_MS, vad, frames)):
        path_segments = os.path.join(segments_dir, f"segment_{i}.wav")
        path_results = os.path.join(result_dir, f"result_{i}.wav")
        print(f"Writing {path_segments}...")
        with contextlib.closing(wave.open(path_segments, 'wb')) as wf:
            wf.setnchannels(CHANNELS)
            wf.setsampwidth(audio.get_sample_size(FORMAT))
            wf.setframerate(RATE)
            wf.writeframes(segment)

        executor.submit(process_segment, asr_model, model_nllb, tokenizer_nllb, path_segments, path_results, target_lang, i, json_file_temp, json_file_record)

    stream.stop_stream()
    stream.close()
    audio.terminate()
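For offline testing, vad_collector can be fed frames sliced from a WAV file instead of the microphone; a minimal sketch, assuming a 16 kHz mono 16-bit PCM input (the path is a placeholder):

    # Hypothetical offline driver for vad_collector.
    import wave
    import webrtcvad
    from stream_VAD import Frame, vad_collector

    with wave.open("sample_16k_mono.wav", "rb") as wf:
        pcm = wf.readframes(wf.getnframes())

    frame_bytes = int(16000 * 30 / 1000) * 2  # 30 ms frames, 2 bytes per sample
    frames = (Frame(pcm[i:i + frame_bytes], None, None)
              for i in range(0, len(pcm) - frame_bytes + 1, frame_bytes))

    vad = webrtcvad.Vad(1)
    for n, segment in enumerate(vad_collector(16000, 30, 300, vad, frames)):
        print(f"voiced segment {n}: {len(segment)} bytes")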