Posts: 19
Threads: 2
Joined: Apr 2024
Apr-03-2024, 12:42 PM
(This post was last modified: Apr-09-2024, 09:04 AM by slain.)
Hello,
I'm quite new to Python, and trying to get a working transcriber with Streamlit and HuggingFace models.
I have a hardware constraint: my script must work on a 4GB Nvidia GPU...
Here is the code at this moment:
## Imports ##
import torch
import io
import streamlit as st
from pathlib import Path
from tempfile import NamedTemporaryFile
from transformers import AutoModelForCTC, Wav2Vec2ProcessorWithLM
import nemo.collections.asr as nemo_asr
import torchaudio
## Initialization ##
# Streamlit script: upload an audio file, hand it to a NeMo ASR model, display the result.
# NOTE(review): indentation was lost when this was pasted; the statements after `if` and
# `with` presumably belong to their bodies — confirm against the real file.
# Use the first CUDA GPU when one is available, otherwise the CPU.
# NOTE(review): `device` is not referenced again in the lines shown here.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model_name = "bofenghuang/stt_fr_fastconformer_hybrid_large"
# Load the pretrained ASR model named by `model_name`.
asr_model = nemo_asr.models.ASRModel.from_pretrained(model_name=model_name)
## Display ##
st.title("Transcribe audio to text")
# Two columns: col1 receives the status message, col2 the transcription output.
col1, col2 = st.columns(2)
# Upload widget in the sidebar, restricted to the listed audio extensions.
audio_source=st.sidebar.file_uploader(label="Choose file", type=["wav","m4a","mp3","wma"])
## Variables ##
suffix = ""
predicted_text = ""
## Processing ##
# Skip processing while audio_source is None.
if audio_source is not None:
# Reuse the uploaded file's extension as the temporary file's suffix.
suffix = Path(audio_source.name).suffix
col1.write("Starting process")
with NamedTemporaryFile(suffix=suffix) as temp_file:
# Copy the uploaded bytes into the temporary file, then rewind it.
temp_file.write(audio_source.getvalue())
temp_file.seek(0)
# NOTE(review): the file object itself is passed to transcribe(); NeMo's documented
# example passes a list of path strings — TODO confirm the expected argument type.
predicted_text = asr_model.transcribe(temp_file)
col2.write("Transcribed text :")
col2.write(predicted_text)
st.sidebar.download_button(label="Download text", data=predicted_text, file_name="transcript.txt",mime="text/plain")
Passing the audio through a temporary file gives me this error:
TypeError: object of type '_TemporaryFileWrapper' has no len()
I guess I should convert my audio content to another type, but I don't know which type to convert it to, how to convert it, or how to get the contents back from the new type.
Does anyone have an idea what I'm missing?
Thank you
Posts: 6,250
Threads: 16
Joined: Feb 2020
Apr-03-2024, 01:01 PM
(This post was last modified: Apr-03-2024, 01:09 PM by deanhystad.)
Please post entire error message including traceback.
Posts: 19
Threads: 2
Joined: Apr 2024
Apr-03-2024, 01:13 PM
(This post was last modified: Apr-03-2024, 01:13 PM by slain.)
I tried some changes in my code, and the traceback changed:
FileNotFoundError: [Errno 2] No such file or directory: '/tmp/tmppvkyb7kx'
Traceback:
File "/home/ild/miniconda3/envs/transcript/lib/python3.10/site-packages/streamlit/runtime/scriptrunner/script_runner.py", line 542, in _run_script
exec(code, module.__dict__)
File "/home/ild/secrétaire.py", line 14, in <module>
asr_model = nemo_asr.models.ASRModel.from_pretrained(model_name=model_name)
File "/home/ild/miniconda3/envs/transcript/lib/python3.10/site-packages/nemo/core/classes/common.py", line 863, in from_pretrained
instance = class_.restore_from(
File "/home/ild/miniconda3/envs/transcript/lib/python3.10/site-packages/nemo/core/classes/modelPT.py", line 449, in restore_from
instance = cls._save_restore_connector.restore_from(
File "/home/ild/miniconda3/envs/transcript/lib/python3.10/site-packages/nemo/core/connectors/save_restore_connector.py", line 241, in restore_from
loaded_params = self.load_config_and_state_dict(
File "/home/ild/miniconda3/envs/transcript/lib/python3.10/site-packages/nemo/core/connectors/save_restore_connector.py", line 171, in load_config_and_state_dict
os.chdir(cwd)
Here is the code (I don't remember what I changed):
## Imports ##
import torch
import io
import streamlit as st
from pathlib import Path
from tempfile import NamedTemporaryFile
from transformers import AutoModelForCTC, Wav2Vec2ProcessorWithLM
import nemo.collections.asr as nemo_asr
import torchaudio
## Initialization ##
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")  # first CUDA GPU if available, else CPU; NOTE(review): not referenced again in the lines shown
model_name = "bofenghuang/stt_fr_fastconformer_hybrid_large"
asr_model = nemo_asr.models.ASRModel.from_pretrained(model_name=model_name)  # load the pretrained ASR model named by model_name
## Display ##
st.title("Facilitateur de compte-rendus")
col1, col2 = st.columns(2)  # col1: status message, col2: transcription output
audio_source=st.sidebar.file_uploader(label="Choisir votre fichier", type=["wav","m4a","mp3","wma"])  # sidebar upload widget limited to these extensions
## Variables ##
suffix = ""
predicted_text = ""
## Processing ##
#col1.subheader("Modèle utilisé : nvidia/stt_fr_conformer_ctc_large")
if audio_source is not None:  # skip processing while audio_source is None; NOTE(review): indentation of the following lines was lost in the paste
suffix = Path(audio_source.name).suffix  # reuse the uploaded file's extension for the temporary file
col1.write("Démarrage de la transcription")
# buf = audio_source.getvalue()
# predicted_text = asr_model.transcribe(buf)
with NamedTemporaryFile(suffix=suffix) as temp_file:
temp_file.write(audio_source.getvalue())  # copy the uploaded bytes into the temporary file
temp_file.seek(0)  # rewind to the start of the temporary file
predicted_text = asr_model.transcribe(temp_file)  # NOTE(review): passes the file object; NeMo's documented example passes a list of path strings — TODO confirm
col2.write("Fichier transcrit :")
col2.write(predicted_text)
st.sidebar.download_button(label="Télécharger la transcription", data=predicted_text, file_name="transcript.txt",mime="text/plain")
This time I left the comments in to keep the same line numbers.
Posts: 6,250
Threads: 16
Joined: Feb 2020
Looking at the NeMo ASR documentation, transcribe takes a file name, not a file object. For real-time transcription you can use buffered streaming, where you pass the audio data to be transcribed as a numpy array.
Posts: 19
Threads: 2
Joined: Apr 2024
So, as I understand, I should modify my line 35 as follows:
predicted_text = asr_model.transcribe(temp_file.name)
I tried it and added a check to see the filename I'm passing to the transcribe function:
col2.write(temp_file.name)
The write returns /tmp/tmpbykj1n96.mp3, so it looks good...
But the transcribe function throws an error "Is a directory: '/'"
IsADirectoryError: [Errno 21] Is a directory: '/'
Traceback:
File "/home/ild/miniconda3/envs/transcript/lib/python3.10/site-packages/streamlit/runtime/scriptrunner/script_runner.py", line 542, in _run_script
exec(code, module.__dict__)
File "/home/ild/trans-nemo.py", line 33, in <module>
predicted_text = asr_model.transcribe(temp_file.name)
File "/home/ild/miniconda3/envs/transcript/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/home/ild/miniconda3/envs/transcript/lib/python3.10/site-packages/nemo/collections/asr/models/ctc_models.py", line 201, in transcribe
for test_batch in tqdm(temporary_datalayer, desc="Transcribing", disable=not verbose):
File "/home/ild/.local/lib/python3.10/site-packages/tqdm/std.py", line 1181, in __iter__
for obj in iterable:
File "/home/ild/miniconda3/envs/transcript/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 631, in __next__
data = self._next_data()
File "/home/ild/miniconda3/envs/transcript/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 675, in _next_data
data = self._dataset_fetcher.fetch(index) # may raise StopIteration
File "/home/ild/miniconda3/envs/transcript/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 51, in fetch
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/home/ild/miniconda3/envs/transcript/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 51, in <listcomp>
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/home/ild/miniconda3/envs/transcript/lib/python3.10/site-packages/nemo/collections/asr/data/audio_to_text.py", line 477, in __getitem__
features = self.featurizer.process(
File "/home/ild/miniconda3/envs/transcript/lib/python3.10/site-packages/nemo/collections/asr/parts/preprocessing/features.py", line 186, in process
audio = AudioSegment.from_file(
File "/home/ild/miniconda3/envs/transcript/lib/python3.10/site-packages/nemo/collections/asr/parts/preprocessing/segment.py", line 259, in from_file
samples = Audio.from_file(audio_file)
File "/home/ild/miniconda3/envs/transcript/lib/python3.10/site-packages/pydub/audio_segment.py", line 651, in from_file
file, close_file = _fd_or_path_or_tempfile(file, 'rb', tempfile=False)
File "/home/ild/miniconda3/envs/transcript/lib/python3.10/site-packages/pydub/utils.py", line 60, in _fd_or_path_or_tempfile
fd = open(fd, mode=mode)
I don't understand where the '/' comes from, as I'm calling asr_model.transcribe(/tmp/tmpbykj1n96.mp3)?
The only thing I'm sure about is that I don't need real-time transcription with microphone recording from my user's device: we can record via Teams or via the Windows 10/11 voice recorder (or a smartphone's recorder), then transcribe later.
Posts: 6,250
Threads: 16
Joined: Feb 2020
I missed that transcribe is passed a list of file names. They have a 3 line example here:
https://docs.nvidia.com/deeplearning/nem...intro.html
import nemo.collections.asr as nemo_asr
asr_model = nemo_asr.models.ASRModel.from_pretrained("stt_en_fastconformer_transducer_large")
transcript = asr_model.transcribe(["path/to/audio_file.wav"])
It appears that you need to pass a list. Now the earlier error about "TypeError: object of type '_TemporaryFileWrapper' has no len()" makes more sense.
You should close your file before calling transcribe.
It might make more sense for you to pass the audio data directly instead of writing to a file and transcribing the file.
Posts: 19
Threads: 2
Joined: Apr 2024
So, I should try predicted_text = asr_model.transcribe(Path(audio_source.name)) ?
I'm not sure because audio_source is a streamlit file uploader, not directly the audio data...
Quote:It might make more sense for you to pass the audio data directly instead of writing to a file and transcribing the file.
I think you're right, it would help me simplify the code (no more need to check the suffix or to put the data in a temporary file, and one less import), but I'm a newbie and I don't know how to convert from the uploader to audio data :(
Posts: 6,250
Threads: 16
Joined: Feb 2020
To pass the file, you need to put the file name in a list
predicted_text = asr_model.transcribe([audio_source.name]))
From that I gather you can transcribe multiple audio files into one result.
Posts: 19
Threads: 2
Joined: Apr 2024
I'll test tomorrow, will keep you informed :)
Posts: 19
Threads: 2
Joined: Apr 2024
Hello,
I just tested, it gives me another error :
FileNotFoundError: [Errno 2] No such file or directory: 'Test-transcription.mp3'
Traceback:
File "/home/ild/.local/lib/python3.12/site-packages/streamlit/runtime/scriptrunner/script_runner.py", line 542, in _run_script
exec(code, module.__dict__)
File "/home/ild/trans-nemo.py", line 28, in <module>
predicted_text = asr_model.transcribe([audio_source.name])
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ild/.local/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/home/ild/miniconda3/lib/python3.12/site-packages/nemo/collections/asr/models/ctc_models.py", line 201, in transcribe
for test_batch in tqdm(temporary_datalayer, desc="Transcribing", disable=not verbose):
File "/home/ild/miniconda3/lib/python3.12/site-packages/tqdm/std.py", line 1178, in __iter__
for obj in iterable:
File "/home/ild/.local/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 631, in __next__
data = self._next_data()
^^^^^^^^^^^^^^^^^
File "/home/ild/.local/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 675, in _next_data
data = self._dataset_fetcher.fetch(index) # may raise StopIteration
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ild/.local/lib/python3.12/site-packages/torch/utils/data/_utils/fetch.py", line 51, in fetch
data = [self.dataset[idx] for idx in possibly_batched_index]
~~~~~~~~~~~~^^^^^
File "/home/ild/miniconda3/lib/python3.12/site-packages/nemo/collections/asr/data/audio_to_text.py", line 477, in __getitem__
features = self.featurizer.process(
^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ild/miniconda3/lib/python3.12/site-packages/nemo/collections/asr/parts/preprocessing/features.py", line 186, in process
audio = AudioSegment.from_file(
^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ild/miniconda3/lib/python3.12/site-packages/nemo/collections/asr/parts/preprocessing/segment.py", line 259, in from_file
samples = Audio.from_file(audio_file)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ild/miniconda3/lib/python3.12/site-packages/pydub/audio_segment.py", line 651, in from_file
file, close_file = _fd_or_path_or_tempfile(file, 'rb', tempfile=False)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ild/miniconda3/lib/python3.12/site-packages/pydub/utils.py", line 60, in _fd_or_path_or_tempfile
fd = open(fd, mode=mode)
^^^^^^^^^^^^^^^^^^^
Maybe I should give it the Path to the audio file?
Something like :
predicted_text = asr_model.transcribe([Path(audio_source.name)])
|