From 7b14c0cb18026988cb60e342ebe5213bb1b1e252 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 4 Feb 2026 06:22:03 +0000 Subject: [PATCH 1/3] Initial plan From 68c4f100a175c15959f2d1d9bbb1d4b7bf1dee87 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 4 Feb 2026 06:25:51 +0000 Subject: [PATCH 2/3] Address code review comments - fix critical issues and clean up code Co-authored-by: jodavis <6740581+jodavis@users.noreply.github.com> --- .../intent_prediction/01_generate_phrases.py | 3 +- ml/scripts/requirements.txt | 21 +++-- .../01_generate_speech_samples.py | 6 +- .../01a_generate_speech_sample_variations.py | 2 +- ml/scripts/speech_to_text/02_add_delays.py | 1 - .../02a_randomize_delay_variations.py | 3 - .../speech_to_text/03_add_background_noise.py | 85 ++++++++++--------- .../speech_to_text/05_create_set_manifests.py | 4 +- .../speech_to_text/07_compute_spectrograms.py | 4 +- ml/scripts/speech_to_text/08_train_model.py | 13 ++- .../speech_to_text/09_evaluate_model.py | 15 ++-- .../10_evaluate_test_samples.py | 10 +-- 12 files changed, 79 insertions(+), 88 deletions(-) diff --git a/ml/scripts/intent_prediction/01_generate_phrases.py b/ml/scripts/intent_prediction/01_generate_phrases.py index 82aa8d2..e8af56a 100644 --- a/ml/scripts/intent_prediction/01_generate_phrases.py +++ b/ml/scripts/intent_prediction/01_generate_phrases.py @@ -6,7 +6,6 @@ for fine-tuning an LLM to handle remote control commands as a fallback. """ -import xml.etree.ElementTree as ET import csv import random import re @@ -95,7 +94,7 @@ def __init__(self, target_samples: int): self.target_samples = target_samples (self.generated, self.existing_variations) = self.load_existing_variations(OUTPUT_FILE) - def load_existing_variations(self, filepath: Path) -> (set | List[Dict[str, str]]): + def load_existing_variations(self, filepath: Path) -> Tuple[set, List[Dict[str, str]]]: """Load existing variations from a CSV file to avoid duplicates.""" variations = [] generated_keys = set() diff --git a/ml/scripts/requirements.txt b/ml/scripts/requirements.txt index e5ffe12..6569a61 100644 --- a/ml/scripts/requirements.txt +++ b/ml/scripts/requirements.txt @@ -1,11 +1,10 @@ -edge-tts -pandas -tqdm -pydub -soundfile -librosa -tensorflow -tensorflow.keras -onnx -tf2onnx -jiwer \ No newline at end of file +edge-tts==6.1.9 +pandas==2.2.0 +tqdm==4.66.1 +pydub==0.25.1 +soundfile==0.12.1 +librosa==0.10.1 +tensorflow==2.15.0 +onnx==1.15.0 +tf2onnx==1.16.1 +jiwer==3.0.3 \ No newline at end of file diff --git a/ml/scripts/speech_to_text/01_generate_speech_samples.py b/ml/scripts/speech_to_text/01_generate_speech_samples.py index 24d96e1..6251113 100644 --- a/ml/scripts/speech_to_text/01_generate_speech_samples.py +++ b/ml/scripts/speech_to_text/01_generate_speech_samples.py @@ -25,7 +25,8 @@ # Functions async def generate_samples(): - for count, idx in enumerate(tqdm(subsampled_indices, desc="Generating speech samples")): + count = 0 + for idx in tqdm(subsampled_indices, desc="Generating speech samples"): output_path = paths.output_dir / phrases_df.iloc[idx]['sample_file_name'] if output_path.exists(): continue # Skip if samples already exist for this phrase index @@ -39,6 +40,7 @@ async def generate_samples(): rate=speech_rate_str, ) await communicate.save(str(output_path)) + count += 1 except Exception as e: print(f"Error generating sample for index {idx} with voice {voice}: {e}") continue @@ -47,7 +49,7 @@ async def generate_samples(): async def main(): print("Starting sample generation...") total_generated = await generate_samples() - print(f"Sample generation completed. Total samples generated: {total_generated + 1}") + print(f"Sample generation completed. Total samples generated: {total_generated}") if __name__ == "__main__": asyncio.run(main()) \ No newline at end of file diff --git a/ml/scripts/speech_to_text/01a_generate_speech_sample_variations.py b/ml/scripts/speech_to_text/01a_generate_speech_sample_variations.py index 71836c6..5b79f50 100644 --- a/ml/scripts/speech_to_text/01a_generate_speech_sample_variations.py +++ b/ml/scripts/speech_to_text/01a_generate_speech_sample_variations.py @@ -62,7 +62,7 @@ async def generate_variation_records(voices: list): try: existing_idx = existing_phrases.index(phrase) records.append(existing_records[existing_idx]) - existing_phrases.remove(existing_idx) + existing_phrases.pop(existing_idx) continue except ValueError: pass # Phrase not found in existing records, proceed to create new variations diff --git a/ml/scripts/speech_to_text/02_add_delays.py b/ml/scripts/speech_to_text/02_add_delays.py index 26051e9..0ca1bb9 100644 --- a/ml/scripts/speech_to_text/02_add_delays.py +++ b/ml/scripts/speech_to_text/02_add_delays.py @@ -1,7 +1,6 @@ import argparse import os from pathlib import Path -import random import pandas as pd import soundfile as sf import numpy as np diff --git a/ml/scripts/speech_to_text/02a_randomize_delay_variations.py b/ml/scripts/speech_to_text/02a_randomize_delay_variations.py index c6ae872..f1cff0d 100644 --- a/ml/scripts/speech_to_text/02a_randomize_delay_variations.py +++ b/ml/scripts/speech_to_text/02a_randomize_delay_variations.py @@ -3,9 +3,6 @@ import pandas as pd import random import os -import asyncio -import edge_tts -from tqdm import tqdm # Settings # Delay frequency diff --git a/ml/scripts/speech_to_text/03_add_background_noise.py b/ml/scripts/speech_to_text/03_add_background_noise.py index f39361f..10f794e 100644 --- a/ml/scripts/speech_to_text/03_add_background_noise.py +++ b/ml/scripts/speech_to_text/03_add_background_noise.py @@ -24,52 +24,53 @@ os.makedirs(paths.output_dir, exist_ok=True) def get_random_noise(noise_files, length, sr): - noise_file = random.choice(noise_files) - noise, noise_sr = sf.read(noise_file) - if len(noise.shape) > 1: - noise = noise[:,0] # Use first channel if stereo - if noise_sr != sr: - # Resample noise to match target sample rate - num_samples = int(len(noise) * sr / noise_sr) - indices = np.linspace(0, len(noise) - 1, num_samples).astype(int) - noise = noise[indices] - if len(noise) < length: - # Loop noise if too short - repeats = int(np.ceil(length / len(noise))) - noise = np.tile(noise, repeats) - start = random.randint(0, len(noise) - length) - return noise[start:start+length] + noise_file = random.choice(noise_files) + noise, noise_sr = sf.read(noise_file) + if len(noise.shape) > 1: + noise = noise[:,0] # Use first channel if stereo + if noise_sr != sr: + # Resample noise to match target sample rate + num_samples = int(len(noise) * sr / noise_sr) + indices = np.linspace(0, len(noise) - 1, num_samples).astype(int) + noise = noise[indices] + if len(noise) < length: + # Loop noise if too short + repeats = int(np.ceil(length / len(noise))) + noise = np.tile(noise, repeats) + max_start = max(0, len(noise) - length) + start = random.randint(0, max_start) + return noise[start:start+length] def add_noise_to_audio(audio, noise, volume): - return audio + noise * volume + return audio + noise * volume def main(): - noise_files = list(paths.noise_dir.glob("*.wav")) - if not noise_files: - print(f"No noise samples found in {paths.noise_dir}") - return - input_files = list(paths.input_dir.glob("*.wav")) - for input_file in tqdm(input_files, desc="Processing audio files", unit="file", total=len(input_files)): - stem = input_file.stem - audio, sr = sf.read(input_file) - if len(audio.shape) > 1: - audio = audio[:,0] # Use first channel if stereo - add_noise = (random.randint(1, background_noise_frequency) == 1) - if add_noise: - noise = get_random_noise(noise_files, len(audio), sr) - volume = random.uniform(background_noise_volume_min, background_noise_volume_max) - audio_noisy = add_noise_to_audio(audio, noise, volume) - # Clip to [-1,1] to avoid overflow - audio_noisy = np.clip(audio_noisy, -1.0, 1.0) - out_audio = audio_noisy - # Modify filename to include _bg{volume} without leading '0.' - volume_str = f"{int(volume * 1000):03d}" - out_filename = f"{stem}_bg{volume_str}.wav" - else: - out_audio = audio - out_filename = input_file.name - out_path = paths.output_dir / out_filename - sf.write(out_path, out_audio, sr) + noise_files = list(paths.noise_dir.glob("*.wav")) + if not noise_files: + print(f"No noise samples found in {paths.noise_dir}") + return + input_files = list(paths.input_dir.glob("*.wav")) + for input_file in tqdm(input_files, desc="Processing audio files", unit="file", total=len(input_files)): + stem = input_file.stem + audio, sr = sf.read(input_file) + if len(audio.shape) > 1: + audio = audio[:,0] # Use first channel if stereo + add_noise = (random.randint(1, background_noise_frequency) == 1) + if add_noise: + noise = get_random_noise(noise_files, len(audio), sr) + volume = random.uniform(background_noise_volume_min, background_noise_volume_max) + audio_noisy = add_noise_to_audio(audio, noise, volume) + # Clip to [-1,1] to avoid overflow + audio_noisy = np.clip(audio_noisy, -1.0, 1.0) + out_audio = audio_noisy + # Modify filename to include _bg{volume} without leading '0.' + volume_str = f"{int(volume * 1000):03d}" + out_filename = f"{stem}_bg{volume_str}.wav" + else: + out_audio = audio + out_filename = input_file.name + out_path = paths.output_dir / out_filename + sf.write(out_path, out_audio, sr) if __name__ == "__main__": main() diff --git a/ml/scripts/speech_to_text/05_create_set_manifests.py b/ml/scripts/speech_to_text/05_create_set_manifests.py index 53e745b..8b0a4f0 100644 --- a/ml/scripts/speech_to_text/05_create_set_manifests.py +++ b/ml/scripts/speech_to_text/05_create_set_manifests.py @@ -3,7 +3,6 @@ import os from pathlib import Path import random -import zipfile import csv import re @@ -37,7 +36,8 @@ file_path = Path(root) / file all_files.append(str(file_path.resolve())) -# Shuffle the list +# Shuffle the list (seeded for reproducibility) +random.seed(42) random.shuffle(all_files) total_files = len(all_files) diff --git a/ml/scripts/speech_to_text/07_compute_spectrograms.py b/ml/scripts/speech_to_text/07_compute_spectrograms.py index ab9d08c..b7e1012 100644 --- a/ml/scripts/speech_to_text/07_compute_spectrograms.py +++ b/ml/scripts/speech_to_text/07_compute_spectrograms.py @@ -31,7 +31,7 @@ def compute_melspectrogram(time_steps, wav_path): y, sr = sf.read(str(wav_path)) - # If stereo, convert to mono (average channels) + # If stereo, convert to mono (average channels) if y.ndim > 1: y = np.mean(y, axis=1) S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=80) @@ -45,7 +45,7 @@ def compute_melspectrogram(time_steps, wav_path): return log_S def compute_tokens(vocab_list, transcription): - transcription = re.sub(r"[^a-z0-9\s]", " ", transcription.lower()) + transcription = transcription.lower().replace(',', ' ') tokens = [vocab_list.index(word) for word in transcription.split() if word in vocab_list] tokens = tokens + [pad_value]*(input_token_length-len(tokens)) if len(tokens) Date: Wed, 4 Feb 2026 06:26:56 +0000 Subject: [PATCH 3/3] Update onnx to 1.17.0 to fix security vulnerabilities Co-authored-by: jodavis <6740581+jodavis@users.noreply.github.com> --- ml/scripts/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ml/scripts/requirements.txt b/ml/scripts/requirements.txt index 6569a61..2f94239 100644 --- a/ml/scripts/requirements.txt +++ b/ml/scripts/requirements.txt @@ -5,6 +5,6 @@ pydub==0.25.1 soundfile==0.12.1 librosa==0.10.1 tensorflow==2.15.0 -onnx==1.15.0 +onnx==1.17.0 tf2onnx==1.16.1 jiwer==3.0.3 \ No newline at end of file