Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions ml/scripts/intent_prediction/01_generate_phrases.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
for fine-tuning an LLM to handle remote control commands as a fallback.
"""

import xml.etree.ElementTree as ET
import csv
import random
import re
Expand Down Expand Up @@ -95,7 +94,7 @@ def __init__(self, target_samples: int):
self.target_samples = target_samples
(self.generated, self.existing_variations) = self.load_existing_variations(OUTPUT_FILE)

def load_existing_variations(self, filepath: Path) -> (set | List[Dict[str, str]]):
def load_existing_variations(self, filepath: Path) -> Tuple[set, List[Dict[str, str]]]:
"""Load existing variations from a CSV file to avoid duplicates."""
variations = []
generated_keys = set()
Expand Down
21 changes: 10 additions & 11 deletions ml/scripts/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
edge-tts
pandas
tqdm
pydub
soundfile
librosa
tensorflow
tensorflow.keras
onnx
tf2onnx
jiwer
edge-tts==6.1.9
pandas==2.2.0
tqdm==4.66.1
pydub==0.25.1
soundfile==0.12.1
librosa==0.10.1
tensorflow==2.15.0
onnx==1.17.0
tf2onnx==1.16.1
jiwer==3.0.3
6 changes: 4 additions & 2 deletions ml/scripts/speech_to_text/01_generate_speech_samples.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@

# Functions
async def generate_samples():
for count, idx in enumerate(tqdm(subsampled_indices, desc="Generating speech samples")):
count = 0
for idx in tqdm(subsampled_indices, desc="Generating speech samples"):
output_path = paths.output_dir / phrases_df.iloc[idx]['sample_file_name']
if output_path.exists():
continue # Skip if samples already exist for this phrase index
Expand All @@ -39,6 +40,7 @@ async def generate_samples():
rate=speech_rate_str,
)
await communicate.save(str(output_path))
count += 1
except Exception as e:
print(f"Error generating sample for index {idx} with voice {voice}: {e}")
continue
Expand All @@ -47,7 +49,7 @@ async def generate_samples():
async def main():
print("Starting sample generation...")
total_generated = await generate_samples()
print(f"Sample generation completed. Total samples generated: {total_generated + 1}")
print(f"Sample generation completed. Total samples generated: {total_generated}")

if __name__ == "__main__":
asyncio.run(main())
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ async def generate_variation_records(voices: list):
try:
existing_idx = existing_phrases.index(phrase)
records.append(existing_records[existing_idx])
existing_phrases.remove(existing_idx)
existing_phrases.pop(existing_idx)
continue
except ValueError:
pass # Phrase not found in existing records, proceed to create new variations
Expand Down
1 change: 0 additions & 1 deletion ml/scripts/speech_to_text/02_add_delays.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import argparse
import os
from pathlib import Path
import random
import pandas as pd
import soundfile as sf
import numpy as np
Expand Down
3 changes: 0 additions & 3 deletions ml/scripts/speech_to_text/02a_randomize_delay_variations.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,6 @@
import pandas as pd
import random
import os
import asyncio
import edge_tts
from tqdm import tqdm

# Settings
# Delay frequency
Expand Down
85 changes: 43 additions & 42 deletions ml/scripts/speech_to_text/03_add_background_noise.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,52 +24,53 @@
os.makedirs(paths.output_dir, exist_ok=True)

def get_random_noise(noise_files, length, sr):
    """Return a random mono noise slice of exactly `length` samples at rate `sr`.

    A noise file is chosen uniformly at random from `noise_files`, downmixed
    to its first channel if stereo, crudely resampled by index decimation if
    its sample rate differs from `sr`, and tiled when shorter than `length`.

    Args:
        noise_files: Non-empty sequence of paths to WAV noise files.
        length: Number of samples to return.
        sr: Target sample rate in Hz.

    Returns:
        1-D numpy array of `length` noise samples.
    """
    noise_file = random.choice(noise_files)
    noise, noise_sr = sf.read(noise_file)
    if len(noise.shape) > 1:
        noise = noise[:, 0]  # Use first channel if stereo
    if noise_sr != sr:
        # Resample noise to match target sample rate.
        # NOTE: nearest-index decimation, not a band-limited resample —
        # adequate for background noise augmentation.
        num_samples = int(len(noise) * sr / noise_sr)
        indices = np.linspace(0, len(noise) - 1, num_samples).astype(int)
        noise = noise[indices]
    if len(noise) < length:
        # Loop noise if too short
        repeats = int(np.ceil(length / len(noise)))
        noise = np.tile(noise, repeats)
    # After tiling len(noise) >= length, so max_start >= 0; the max() guard
    # keeps random.randint(0, max_start) valid when len(noise) == length.
    max_start = max(0, len(noise) - length)
    start = random.randint(0, max_start)
    return noise[start:start + length]

def add_noise_to_audio(audio, noise, volume):
    """Mix `noise` into `audio`, scaling the noise by `volume`.

    No clipping is performed here; the caller is expected to clip the
    result to [-1, 1] before writing it out.
    """
    return audio + noise * volume

def main():
    """Add random background noise to a fraction of the input WAV files.

    Reads every .wav under `paths.input_dir`; with probability
    1/background_noise_frequency a file gets a random noise bed mixed in at a
    random volume in [background_noise_volume_min, background_noise_volume_max]
    and is saved with a `_bg{volume}` suffix; otherwise it is copied through
    under its original name. Output goes to `paths.output_dir`.
    """
    noise_files = list(paths.noise_dir.glob("*.wav"))
    if not noise_files:
        print(f"No noise samples found in {paths.noise_dir}")
        return
    input_files = list(paths.input_dir.glob("*.wav"))
    for input_file in tqdm(input_files, desc="Processing audio files", unit="file", total=len(input_files)):
        stem = input_file.stem
        audio, sr = sf.read(input_file)
        if len(audio.shape) > 1:
            audio = audio[:,0]  # Use first channel if stereo
        # Add noise to roughly 1 in background_noise_frequency files
        add_noise = (random.randint(1, background_noise_frequency) == 1)
        if add_noise:
            noise = get_random_noise(noise_files, len(audio), sr)
            volume = random.uniform(background_noise_volume_min, background_noise_volume_max)
            audio_noisy = add_noise_to_audio(audio, noise, volume)
            # Clip to [-1,1] to avoid overflow
            audio_noisy = np.clip(audio_noisy, -1.0, 1.0)
            out_audio = audio_noisy
            # Modify filename to include _bg{volume} without leading '0.'
            volume_str = f"{int(volume * 1000):03d}"
            out_filename = f"{stem}_bg{volume_str}.wav"
        else:
            out_audio = audio
            out_filename = input_file.name
        out_path = paths.output_dir / out_filename
        sf.write(out_path, out_audio, sr)

if __name__ == "__main__":
main()
Expand Down
4 changes: 2 additions & 2 deletions ml/scripts/speech_to_text/05_create_set_manifests.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import os
from pathlib import Path
import random
import zipfile
import csv
import re

Expand Down Expand Up @@ -37,7 +36,8 @@
file_path = Path(root) / file
all_files.append(str(file_path.resolve()))

# Shuffle the list
# Shuffle the list (seeded for reproducibility)
random.seed(42)
random.shuffle(all_files)

total_files = len(all_files)
Expand Down
4 changes: 2 additions & 2 deletions ml/scripts/speech_to_text/07_compute_spectrograms.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@

def compute_melspectrogram(time_steps, wav_path):
y, sr = sf.read(str(wav_path))
# If stereo, convert to mono (average channels)
# If stereo, convert to mono (average channels)
if y.ndim > 1:
y = np.mean(y, axis=1)
S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=80)
Expand All @@ -45,7 +45,7 @@ def compute_melspectrogram(time_steps, wav_path):
return log_S

def compute_tokens(vocab_list, transcription):
    """Convert a transcription string into a fixed-length list of vocab indices.

    The text is lowercased and commas are treated as word separators; words
    not present in `vocab_list` are silently dropped. The token list is padded
    with `pad_value` (or truncated) to exactly `input_token_length` entries.

    NOTE(review): `pad_value` and `input_token_length` are module-level
    settings defined elsewhere in this script.
    """
    # Normalize: lowercase and split on commas as well as whitespace
    transcription = transcription.lower().replace(',', ' ')
    tokens = [vocab_list.index(word) for word in transcription.split() if word in vocab_list]
    # Pad or truncate to the fixed token length expected by the model
    tokens = tokens + [pad_value]*(input_token_length-len(tokens)) if len(tokens)<input_token_length else tokens[:input_token_length]
    return tokens
Expand Down
13 changes: 5 additions & 8 deletions ml/scripts/speech_to_text/08_train_model.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import argparse
import csv
from pathlib import Path
import os
import numpy as np
Expand Down Expand Up @@ -36,7 +35,8 @@
with open(paths.vocab, 'r', encoding='utf-8') as vocabfile:
vocab_list = [line.strip() for line in vocabfile if line.strip()]
num_classes = len(vocab_list) + 1 # +1 for CTC blank token
print(f"Vocabulary size: {len(vocab_list)}, Number of classes (with CTC blank): {num_classes}")
ctc_blank_idx = len(vocab_list) # CTC blank token is at the last index
print(f"Vocabulary size: {len(vocab_list)}, Number of classes (with CTC blank): {num_classes}, CTC blank index: {ctc_blank_idx}")

input_layer = Input(shape=(n_mels, time_steps), name='input')
x = layers.Reshape((n_mels, time_steps, 1))(input_layer)
Expand All @@ -58,15 +58,10 @@
model.compile(optimizer='adam', loss='categorical_crossentropy') # placeholder
print("Model compiled.")

# Custom CTC loss layer for Keras functional API
def ctc_loss_fn(y_true, y_pred, input_length, label_length):
return tf.keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)

x_train = []
y_train = []

# Prepare input/output pairs for training
input_output_pairs = []
for _, row in tqdm(training_set.iterrows(), total=len(training_set), desc="Loading training data"):
wav_path = row['filepath']
# Get the corresponding spectrogram/tokens NPY file path
Expand All @@ -91,7 +86,9 @@ def ctc_loss_fn(y_true, y_pred, input_length, label_length):
y_pred = model(x_batch, training=True)
# Compute prediction lengths (time steps of y_pred)
pred_len = tf.fill([tf.shape(y_pred)[0], 1], tf.shape(y_pred)[1])
lbl_len_reshaped = tf.fill([tf.shape(y_batch)[0], 1], tf.shape(y_batch)[1])
# Compute true label lengths by counting non-padding tokens (assumes 0 is padding)
lbl_len = tf.math.count_nonzero(y_batch, axis=1, dtype=tf.int32)
lbl_len_reshaped = tf.expand_dims(lbl_len, axis=1)
loss = tf.keras.backend.ctc_batch_cost(y_batch, y_pred, pred_len, lbl_len_reshaped)
grads = tape.gradient(loss, model.trainable_variables)
model.optimizer.apply_gradients(zip(grads, model.trainable_variables))
Expand Down
15 changes: 8 additions & 7 deletions ml/scripts/speech_to_text/09_evaluate_model.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
import argparse
import csv
from pathlib import Path
import os
import numpy as np
import pandas as pd
from tensorflow.keras import layers, Model, Input
from tqdm import tqdm
from jiwer import wer

Expand All @@ -18,12 +16,12 @@
batch_size = 32 # evaluation batch size

# Parse command-line arguments
parser = argparse.ArgumentParser(description="Train speech-to-text model.")
parser = argparse.ArgumentParser(description="Evaluate speech-to-text model.")
parser.add_argument('--manifest', type=Path, required=True, help='Path to val_manifest.csv')
parser.add_argument('--model', type=Path, required=True, help='Path to model file (speech_to_text_model.keras)')
parser.add_argument('--vocab', type=Path, required=True, help='Path to vocab_list.txt')
parser.add_argument('--spectrogram-dir', type=Path, required=True, help='Directory with spectrogram npy files')
parser.add_argument('--output-dir', type=Path, required=True, help='Directory for output model')
parser.add_argument('--output-dir', type=Path, required=True, help='Directory for evaluation results (predictions and metrics)')
paths = parser.parse_args()

os.makedirs(paths.output_dir, exist_ok=True)
Expand All @@ -35,7 +33,6 @@
# Prepare input/output pairs for evaluation
x_eval = []
y_eval = []
input_output_pairs = []
for _, row in tqdm(eval_set.iterrows(), total=len(eval_set), desc="Loading evaluation data"):
wav_path = row['filepath']
# Get the corresponding spectrogram/tokens NPY file path
Expand All @@ -54,8 +51,12 @@
# Load the vocabulary list from vocab file
with open(paths.vocab, 'r', encoding='utf-8') as vocabfile:
vocab_list = [line.strip() for line in vocabfile if line.strip()]
ctc_blank_idx = len(vocab_list) + 1 # +1 for CTC blank token
print(f"Vocabulary size: {len(vocab_list)}, Number of classes (with CTC blank): {ctc_blank_idx}")
ctc_blank_idx = len(vocab_list) # CTC blank token is conventionally the last index
print(
f"Vocabulary size: {len(vocab_list)}, "
f"Number of classes (with CTC blank): {len(vocab_list) + 1}, "
f"CTC blank index: {ctc_blank_idx}"
)

def ctc_greedy_decode(pred, blank=ctc_blank_idx):
pred_ids = np.argmax(pred, axis=-1)
Expand Down
10 changes: 3 additions & 7 deletions ml/scripts/speech_to_text/10_evaluate_test_samples.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,9 @@
import argparse
import csv
from pathlib import Path
import os
import numpy as np
import pandas as pd
from tensorflow.keras import layers, Model, Input
from tqdm import tqdm
from jiwer import wer
from zipfile import ZipFile

print("Initializing TensorFlow...")
Expand All @@ -19,7 +16,7 @@
batch_size = 32 # evaluation batch size

# Parse command-line arguments
parser = argparse.ArgumentParser(description="Train speech-to-text model.")
parser = argparse.ArgumentParser(description="Evaluate test samples and create ZIP of successfully recognized files.")
parser.add_argument('--manifest', type=Path, required=True, help='Path to test_manifest.csv')
parser.add_argument('--model', type=Path, required=True, help='Path to model file (speech_to_text_model.keras)')
parser.add_argument('--vocab', type=Path, required=True, help='Path to vocab_list.txt')
Expand All @@ -37,7 +34,6 @@
# Prepare input/output pairs for evaluation
x_eval = []
y_eval = []
input_output_pairs = []
for _, row in tqdm(eval_set.iterrows(), total=len(eval_set), desc="Loading evaluation data"):
wav_path = row['filepath']
# Get the corresponding spectrogram/tokens NPY file path
Expand All @@ -56,8 +52,8 @@
# Load the vocabulary list from vocab file
with open(paths.vocab, 'r', encoding='utf-8') as vocabfile:
vocab_list = [line.strip() for line in vocabfile if line.strip()]
ctc_blank_idx = len(vocab_list) + 1 # +1 for CTC blank token
print(f"Vocabulary size: {len(vocab_list)}, Number of classes (with CTC blank): {ctc_blank_idx}")
ctc_blank_idx = len(vocab_list) # CTC blank token is conventionally at the last index
print(f"Vocabulary size: {len(vocab_list)}, Number of classes (with CTC blank): {len(vocab_list) + 1}")

def ctc_greedy_decode(pred, blank=ctc_blank_idx):
pred_ids = np.argmax(pred, axis=-1)
Expand Down