Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions ml/scripts/intent_prediction/01_generate_phrases.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
for fine-tuning an LLM to handle remote control commands as a fallback.
"""

import xml.etree.ElementTree as ET
import csv
import random
import re
Expand Down Expand Up @@ -95,7 +94,7 @@ def __init__(self, target_samples: int):
self.target_samples = target_samples
(self.generated, self.existing_variations) = self.load_existing_variations(OUTPUT_FILE)

def load_existing_variations(self, filepath: Path) -> (set | List[Dict[str, str]]):
def load_existing_variations(self, filepath: Path) -> Tuple[set, List[Dict[str, str]]]:
"""Load existing variations from a CSV file to avoid duplicates."""
variations = []
generated_keys = set()
Expand Down
21 changes: 10 additions & 11 deletions ml/scripts/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
edge-tts
pandas
tqdm
pydub
soundfile
librosa
tensorflow
tensorflow.keras
onnx
tf2onnx
jiwer
edge-tts==6.1.9
pandas==2.2.0
tqdm==4.66.1
pydub==0.25.1
soundfile==0.12.1
librosa==0.10.1
tensorflow==2.15.0
onnx==1.17.0
tf2onnx==1.16.1
jiwer==3.0.3
6 changes: 4 additions & 2 deletions ml/scripts/speech_to_text/01_generate_speech_samples.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@

# Functions
async def generate_samples():
for count, idx in enumerate(tqdm(subsampled_indices, desc="Generating speech samples")):
count = 0
for idx in tqdm(subsampled_indices, desc="Generating speech samples"):
output_path = paths.output_dir / phrases_df.iloc[idx]['sample_file_name']
if output_path.exists():
continue # Skip if samples already exist for this phrase index
Expand All @@ -39,6 +40,7 @@ async def generate_samples():
rate=speech_rate_str,
)
await communicate.save(str(output_path))
count += 1
except Exception as e:
print(f"Error generating sample for index {idx} with voice {voice}: {e}")
continue
Expand All @@ -47,7 +49,7 @@ async def generate_samples():
async def main():
print("Starting sample generation...")
total_generated = await generate_samples()
print(f"Sample generation completed. Total samples generated: {total_generated + 1}")
print(f"Sample generation completed. Total samples generated: {total_generated}")

if __name__ == "__main__":
asyncio.run(main())
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ async def generate_variation_records(voices: list):
try:
existing_idx = existing_phrases.index(phrase)
records.append(existing_records[existing_idx])
existing_phrases.remove(existing_idx)
existing_phrases.pop(existing_idx)
continue
except ValueError:
pass # Phrase not found in existing records, proceed to create new variations
Expand Down
1 change: 0 additions & 1 deletion ml/scripts/speech_to_text/02_add_delays.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import argparse
import os
from pathlib import Path
import random
import pandas as pd
import soundfile as sf
import numpy as np
Expand Down
3 changes: 0 additions & 3 deletions ml/scripts/speech_to_text/02a_randomize_delay_variations.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,6 @@
import pandas as pd
import random
import os
import asyncio
import edge_tts
from tqdm import tqdm

# Settings
# Delay frequency
Expand Down
85 changes: 43 additions & 42 deletions ml/scripts/speech_to_text/03_add_background_noise.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,52 +24,53 @@
os.makedirs(paths.output_dir, exist_ok=True)

def get_random_noise(noise_files, length, sr):
    """Return a random mono noise slice of exactly `length` samples at rate `sr`.

    A noise file is chosen uniformly at random from `noise_files`, downmixed
    to its first channel if stereo, crudely resampled by index decimation if
    its sample rate differs from `sr`, and tiled when shorter than `length`.

    Args:
        noise_files: Non-empty sequence of paths to WAV noise files.
        length: Number of samples to return.
        sr: Target sample rate in Hz.

    Returns:
        1-D numpy array of `length` noise samples.
    """
    noise_file = random.choice(noise_files)
    noise, noise_sr = sf.read(noise_file)
    if len(noise.shape) > 1:
        noise = noise[:, 0]  # Use first channel if stereo
    if noise_sr != sr:
        # Resample noise to match target sample rate.
        # NOTE: nearest-index decimation, not a band-limited resample —
        # adequate for background noise augmentation.
        num_samples = int(len(noise) * sr / noise_sr)
        indices = np.linspace(0, len(noise) - 1, num_samples).astype(int)
        noise = noise[indices]
    if len(noise) < length:
        # Loop noise if too short
        repeats = int(np.ceil(length / len(noise)))
        noise = np.tile(noise, repeats)
    # After tiling len(noise) >= length, so max_start >= 0; the max() guard
    # keeps random.randint(0, max_start) valid when len(noise) == length.
    max_start = max(0, len(noise) - length)
    start = random.randint(0, max_start)
    return noise[start:start + length]

def add_noise_to_audio(audio, noise, volume):
    """Mix `noise` into `audio`, scaling the noise by `volume`.

    No clipping is performed here; the caller is expected to clip the
    result to [-1, 1] before writing it out.
    """
    return audio + noise * volume

def main():
    """Add random background noise to a fraction of the input WAV files.

    Reads every .wav under `paths.input_dir`; with probability
    1/background_noise_frequency a file gets a random noise bed mixed in at a
    random volume in [background_noise_volume_min, background_noise_volume_max]
    and is saved with a `_bg{volume}` suffix; otherwise it is copied through
    under its original name. Output goes to `paths.output_dir`.
    """
    noise_files = list(paths.noise_dir.glob("*.wav"))
    if not noise_files:
        print(f"No noise samples found in {paths.noise_dir}")
        return
    input_files = list(paths.input_dir.glob("*.wav"))
    for input_file in tqdm(input_files, desc="Processing audio files", unit="file", total=len(input_files)):
        stem = input_file.stem
        audio, sr = sf.read(input_file)
        if len(audio.shape) > 1:
            audio = audio[:,0]  # Use first channel if stereo
        # Add noise to roughly 1 in background_noise_frequency files
        add_noise = (random.randint(1, background_noise_frequency) == 1)
        if add_noise:
            noise = get_random_noise(noise_files, len(audio), sr)
            volume = random.uniform(background_noise_volume_min, background_noise_volume_max)
            audio_noisy = add_noise_to_audio(audio, noise, volume)
            # Clip to [-1,1] to avoid overflow
            audio_noisy = np.clip(audio_noisy, -1.0, 1.0)
            out_audio = audio_noisy
            # Modify filename to include _bg{volume} without leading '0.'
            volume_str = f"{int(volume * 1000):03d}"
            out_filename = f"{stem}_bg{volume_str}.wav"
        else:
            out_audio = audio
            out_filename = input_file.name
        out_path = paths.output_dir / out_filename
        sf.write(out_path, out_audio, sr)

if __name__ == "__main__":
main()
Expand Down
4 changes: 2 additions & 2 deletions ml/scripts/speech_to_text/05_create_set_manifests.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import os
from pathlib import Path
import random
import zipfile
import csv
import re

Expand Down Expand Up @@ -37,7 +36,8 @@
file_path = Path(root) / file
all_files.append(str(file_path.resolve()))

# Shuffle the list
# Shuffle the list (seeded for reproducibility)
random.seed(42)
random.shuffle(all_files)

total_files = len(all_files)
Expand Down
4 changes: 2 additions & 2 deletions ml/scripts/speech_to_text/07_compute_spectrograms.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@

def compute_melspectrogram(time_steps, wav_path):
y, sr = sf.read(str(wav_path))
# If stereo, convert to mono (average channels)
# If stereo, convert to mono (average channels)
if y.ndim > 1:
y = np.mean(y, axis=1)
S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=80)
Expand All @@ -45,7 +45,7 @@ def compute_melspectrogram(time_steps, wav_path):
return log_S

def compute_tokens(vocab_list, transcription):
    """Convert a transcription string into a fixed-length list of vocab indices.

    The text is lowercased and commas are treated as word separators; words
    not present in `vocab_list` are silently dropped. The token list is padded
    with `pad_value` (or truncated) to exactly `input_token_length` entries.

    NOTE(review): `pad_value` and `input_token_length` are module-level
    settings defined elsewhere in this script.
    """
    # Normalize: lowercase and split on commas as well as whitespace
    transcription = transcription.lower().replace(',', ' ')
    tokens = [vocab_list.index(word) for word in transcription.split() if word in vocab_list]
    # Pad or truncate to the fixed token length expected by the model
    tokens = tokens + [pad_value]*(input_token_length-len(tokens)) if len(tokens)<input_token_length else tokens[:input_token_length]
    return tokens
Expand Down
13 changes: 5 additions & 8 deletions ml/scripts/speech_to_text/08_train_model.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import argparse
import csv
from pathlib import Path
import os
import numpy as np
Expand Down Expand Up @@ -36,7 +35,8 @@
with open(paths.vocab, 'r', encoding='utf-8') as vocabfile:
vocab_list = [line.strip() for line in vocabfile if line.strip()]
num_classes = len(vocab_list) + 1 # +1 for CTC blank token
print(f"Vocabulary size: {len(vocab_list)}, Number of classes (with CTC blank): {num_classes}")
ctc_blank_idx = len(vocab_list) # CTC blank token is at the last index
print(f"Vocabulary size: {len(vocab_list)}, Number of classes (with CTC blank): {num_classes}, CTC blank index: {ctc_blank_idx}")

input_layer = Input(shape=(n_mels, time_steps), name='input')
x = layers.Reshape((n_mels, time_steps, 1))(input_layer)
Expand All @@ -58,15 +58,10 @@
model.compile(optimizer='adam', loss='categorical_crossentropy') # placeholder
print("Model compiled.")

# Custom CTC loss layer for Keras functional API
def ctc_loss_fn(y_true, y_pred, input_length, label_length):
return tf.keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)

x_train = []
y_train = []

# Prepare input/output pairs for training
input_output_pairs = []
for _, row in tqdm(training_set.iterrows(), total=len(training_set), desc="Loading training data"):
wav_path = row['filepath']
# Get the corresponding spectrogram/tokens NPY file path
Expand All @@ -91,7 +86,9 @@ def ctc_loss_fn(y_true, y_pred, input_length, label_length):
y_pred = model(x_batch, training=True)
# Compute prediction lengths (time steps of y_pred)
pred_len = tf.fill([tf.shape(y_pred)[0], 1], tf.shape(y_pred)[1])
lbl_len_reshaped = tf.fill([tf.shape(y_batch)[0], 1], tf.shape(y_batch)[1])
# Compute true label lengths by counting non-padding tokens (assumes 0 is padding)
lbl_len = tf.math.count_nonzero(y_batch, axis=1, dtype=tf.int32)
lbl_len_reshaped = tf.expand_dims(lbl_len, axis=1)
loss = tf.keras.backend.ctc_batch_cost(y_batch, y_pred, pred_len, lbl_len_reshaped)
grads = tape.gradient(loss, model.trainable_variables)
model.optimizer.apply_gradients(zip(grads, model.trainable_variables))
Expand Down
15 changes: 8 additions & 7 deletions ml/scripts/speech_to_text/09_evaluate_model.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
import argparse
import csv
from pathlib import Path
import os
import numpy as np
import pandas as pd
from tensorflow.keras import layers, Model, Input
from tqdm import tqdm
from jiwer import wer

Expand All @@ -18,12 +16,12 @@
batch_size = 32 # evaluation batch size

# Parse command-line arguments
parser = argparse.ArgumentParser(description="Train speech-to-text model.")
parser = argparse.ArgumentParser(description="Evaluate speech-to-text model.")
parser.add_argument('--manifest', type=Path, required=True, help='Path to val_manifest.csv')
parser.add_argument('--model', type=Path, required=True, help='Path to model file (speech_to_text_model.keras)')
parser.add_argument('--vocab', type=Path, required=True, help='Path to vocab_list.txt')
parser.add_argument('--spectrogram-dir', type=Path, required=True, help='Directory with spectrogram npy files')
parser.add_argument('--output-dir', type=Path, required=True, help='Directory for output model')
parser.add_argument('--output-dir', type=Path, required=True, help='Directory for evaluation results (predictions and metrics)')
paths = parser.parse_args()

os.makedirs(paths.output_dir, exist_ok=True)
Expand All @@ -35,7 +33,6 @@
# Prepare input/output pairs for evaluation
x_eval = []
y_eval = []
input_output_pairs = []
for _, row in tqdm(eval_set.iterrows(), total=len(eval_set), desc="Loading evaluation data"):
wav_path = row['filepath']
# Get the corresponding spectrogram/tokens NPY file path
Expand All @@ -54,8 +51,12 @@
# Load the vocabulary list from vocab file
with open(paths.vocab, 'r', encoding='utf-8') as vocabfile:
vocab_list = [line.strip() for line in vocabfile if line.strip()]
ctc_blank_idx = len(vocab_list) + 1 # +1 for CTC blank token
print(f"Vocabulary size: {len(vocab_list)}, Number of classes (with CTC blank): {ctc_blank_idx}")
ctc_blank_idx = len(vocab_list) # CTC blank token is conventionally the last index
print(
f"Vocabulary size: {len(vocab_list)}, "
f"Number of classes (with CTC blank): {len(vocab_list) + 1}, "
f"CTC blank index: {ctc_blank_idx}"
)

def ctc_greedy_decode(pred, blank=ctc_blank_idx):
pred_ids = np.argmax(pred, axis=-1)
Expand Down
10 changes: 3 additions & 7 deletions ml/scripts/speech_to_text/10_evaluate_test_samples.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,9 @@
import argparse
import csv
from pathlib import Path
import os
import numpy as np
import pandas as pd
from tensorflow.keras import layers, Model, Input
from tqdm import tqdm
from jiwer import wer
from zipfile import ZipFile

print("Initializing TensorFlow...")
Expand All @@ -19,7 +16,7 @@
batch_size = 32 # evaluation batch size

# Parse command-line arguments
parser = argparse.ArgumentParser(description="Train speech-to-text model.")
parser = argparse.ArgumentParser(description="Evaluate test samples and create ZIP of successfully recognized files.")
parser.add_argument('--manifest', type=Path, required=True, help='Path to test_manifest.csv')
parser.add_argument('--model', type=Path, required=True, help='Path to model file (speech_to_text_model.keras)')
parser.add_argument('--vocab', type=Path, required=True, help='Path to vocab_list.txt')
Expand All @@ -37,7 +34,6 @@
# Prepare input/output pairs for evaluation
x_eval = []
y_eval = []
input_output_pairs = []
for _, row in tqdm(eval_set.iterrows(), total=len(eval_set), desc="Loading evaluation data"):
wav_path = row['filepath']
# Get the corresponding spectrogram/tokens NPY file path
Expand All @@ -56,8 +52,8 @@
# Load the vocabulary list from vocab file
with open(paths.vocab, 'r', encoding='utf-8') as vocabfile:
vocab_list = [line.strip() for line in vocabfile if line.strip()]
ctc_blank_idx = len(vocab_list) + 1 # +1 for CTC blank token
print(f"Vocabulary size: {len(vocab_list)}, Number of classes (with CTC blank): {ctc_blank_idx}")
ctc_blank_idx = len(vocab_list) # CTC blank token is conventionally at the last index
print(f"Vocabulary size: {len(vocab_list)}, Number of classes (with CTC blank): {len(vocab_list) + 1}")

def ctc_greedy_decode(pred, blank=ctc_blank_idx):
pred_ids = np.argmax(pred, axis=-1)
Expand Down