API Reference

live_audio_capture

AudioNoiseReduction

A utility class for audio processing tasks such as noise reduction, filtering, and resampling.

Source code in live_audio_capture\audio_noise_reduction.py
class AudioNoiseReduction:
    """
    A utility class for audio processing tasks such as noise reduction, filtering, and resampling.
    """

    @staticmethod
    def apply_noise_reduction(
        audio_chunk: np.ndarray,
        sampling_rate: int,
        stationary: bool = False,
        prop_decrease: float = 1.0,
        n_std_thresh_stationary: float = 1.5,
        n_fft: int = 1024,
        win_length: int = None,
        hop_length: int = None,
        n_jobs: int = 1,  # Number of parallel jobs
        use_torch: bool = False,  # Use PyTorch for spectral gating
        device: str = "cuda",  # Device for PyTorch computation
    ) -> np.ndarray:
        """
        Apply noise reduction using the noisereduce package.

        Args:
            audio_chunk (np.ndarray): The audio chunk to process.
            sampling_rate (int): The sample rate of the audio.
            stationary (bool): Whether to perform stationary noise reduction.
            prop_decrease (float): Proportion to reduce noise by (1.0 = 100%).
            n_std_thresh_stationary (float): Number of standard deviations above mean for thresholding.
            n_fft (int): FFT window size.
            win_length (int): Window length for STFT.
            hop_length (int): Hop length for STFT.
            n_jobs (int): Number of parallel jobs to run. Set to -1 to use all CPU cores.
            use_torch (bool): Whether to use the PyTorch version of spectral gating.
            device (str): Device to run the PyTorch spectral gating on (e.g., "cuda" or "cpu").

        Returns:
            np.ndarray: The processed audio chunk with reduced noise.
        """
        # Apply noise reduction using noisereduce
        reduced_noise = nr.reduce_noise(
            y=audio_chunk,
            sr=sampling_rate,
            stationary=stationary,
            prop_decrease=prop_decrease,
            n_std_thresh_stationary=n_std_thresh_stationary,
            n_fft=n_fft,
            win_length=win_length,
            hop_length=hop_length,
            n_jobs=n_jobs,  # Pass the number of parallel jobs
            use_torch=use_torch,  # Enable/disable PyTorch
            device=device,  # Specify the device for PyTorch
        )
        return reduced_noise

    @staticmethod
    def apply_low_pass_filter(
        audio_chunk: np.ndarray,
        sampling_rate: int,
        cutoff_freq: float = 7900.0,  # Less than Nyquist frequency (8000 Hz)
    ) -> np.ndarray:
        """
        Apply a low-pass filter to the audio chunk.

        Args:
            audio_chunk (np.ndarray): The audio chunk to process.
            sampling_rate (int): The sample rate of the audio.
            cutoff_freq (float): The cutoff frequency for the low-pass filter.

        Returns:
            np.ndarray: The filtered audio chunk.
        """
        # Normalize the cutoff frequency to the range [0, 1]
        nyquist = 0.5 * sampling_rate
        if cutoff_freq >= nyquist:
            raise ValueError(
                f"Cutoff frequency must be less than the Nyquist frequency ({nyquist} Hz). "
                f"Provided cutoff frequency: {cutoff_freq} Hz."
            )
        normal_cutoff = cutoff_freq / nyquist

        # Design the Butterworth filter
        b, a = butter(5, normal_cutoff, btype="low", analog=False)

        # Apply the filter to the audio chunk
        return lfilter(b, a, audio_chunk)

    @staticmethod
    def resample_audio(
        audio_chunk: np.ndarray,
        original_rate: int,
        target_rate: int,
    ) -> np.ndarray:
        """
        Resample the audio chunk to a target sample rate.

        Args:
            audio_chunk (np.ndarray): The audio chunk to resample.
            original_rate (int): The original sample rate.
            target_rate (int): The target sample rate.

        Returns:
            np.ndarray: The resampled audio chunk.
        """
        num_samples = int(len(audio_chunk) * target_rate / original_rate)
        return resample(audio_chunk, num_samples)
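
The three methods compose into a simple front-end chain. A minimal sketch, assuming the class is importable from the module path shown above: denoise a chunk, low-pass it, then downsample it. Filtering below 8 kHz before resampling to 16 kHz also doubles as a crude anti-aliasing step, since 8 kHz is the Nyquist frequency of the target rate.

import numpy as np

from live_audio_capture.audio_noise_reduction import AudioNoiseReduction

sr = 44100
chunk = np.random.randn(sr).astype(np.float32)  # stand-in for one second of captured audio

denoised = AudioNoiseReduction.apply_noise_reduction(chunk, sampling_rate=sr, use_torch=False)
smoothed = AudioNoiseReduction.apply_low_pass_filter(denoised, sr, cutoff_freq=7900.0)
downsampled = AudioNoiseReduction.resample_audio(smoothed, original_rate=sr, target_rate=16000)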

apply_low_pass_filter(audio_chunk, sampling_rate, cutoff_freq=7900.0) staticmethod

Apply a low-pass filter to the audio chunk.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| audio_chunk | ndarray | The audio chunk to process. | required |
| sampling_rate | int | The sample rate of the audio. | required |
| cutoff_freq | float | The cutoff frequency for the low-pass filter. | 7900.0 |

Returns:

| Type | Description |
| --- | --- |
| ndarray | The filtered audio chunk. |

Source code in live_audio_capture\audio_noise_reduction.py
@staticmethod
def apply_low_pass_filter(
    audio_chunk: np.ndarray,
    sampling_rate: int,
    cutoff_freq: float = 7900.0,  # Less than Nyquist frequency (8000 Hz)
) -> np.ndarray:
    """
    Apply a low-pass filter to the audio chunk.

    Args:
        audio_chunk (np.ndarray): The audio chunk to process.
        sampling_rate (int): The sample rate of the audio.
        cutoff_freq (float): The cutoff frequency for the low-pass filter.

    Returns:
        np.ndarray: The filtered audio chunk.
    """
    # Normalize the cutoff frequency to the range [0, 1]
    nyquist = 0.5 * sampling_rate
    if cutoff_freq >= nyquist:
        raise ValueError(
            f"Cutoff frequency must be less than the Nyquist frequency ({nyquist} Hz). "
            f"Provided cutoff frequency: {cutoff_freq} Hz."
        )
    normal_cutoff = cutoff_freq / nyquist

    # Design the Butterworth filter
    b, a = butter(5, normal_cutoff, btype="low", analog=False)

    # Apply the filter to the audio chunk
    return lfilter(b, a, audio_chunk)
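
A minimal usage sketch (the import path follows the source file shown above). It filters one second of white noise at a 16 kHz sample rate; the cutoff must stay below the Nyquist frequency (8000 Hz here), otherwise the method raises ValueError.

import numpy as np

from live_audio_capture.audio_noise_reduction import AudioNoiseReduction

sr = 16000
noise = np.random.randn(sr).astype(np.float32)  # one second of white noise

# A 4 kHz cutoff is safely below Nyquist (sr / 2 = 8000 Hz).
filtered = AudioNoiseReduction.apply_low_pass_filter(noise, sr, cutoff_freq=4000.0)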

apply_noise_reduction(audio_chunk, sampling_rate, stationary=False, prop_decrease=1.0, n_std_thresh_stationary=1.5, n_fft=1024, win_length=None, hop_length=None, n_jobs=1, use_torch=False, device='cuda') staticmethod

Apply noise reduction using the noisereduce package.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| audio_chunk | ndarray | The audio chunk to process. | required |
| sampling_rate | int | The sample rate of the audio. | required |
| stationary | bool | Whether to perform stationary noise reduction. | False |
| prop_decrease | float | Proportion to reduce noise by (1.0 = 100%). | 1.0 |
| n_std_thresh_stationary | float | Number of standard deviations above mean for thresholding. | 1.5 |
| n_fft | int | FFT window size. | 1024 |
| win_length | int | Window length for STFT. | None |
| hop_length | int | Hop length for STFT. | None |
| n_jobs | int | Number of parallel jobs to run. Set to -1 to use all CPU cores. | 1 |
| use_torch | bool | Whether to use the PyTorch version of spectral gating. | False |
| device | str | Device to run the PyTorch spectral gating on (e.g., "cuda" or "cpu"). | 'cuda' |

Returns:

| Type | Description |
| --- | --- |
| ndarray | The processed audio chunk with reduced noise. |

Source code in live_audio_capture\audio_noise_reduction.py
@staticmethod
def apply_noise_reduction(
    audio_chunk: np.ndarray,
    sampling_rate: int,
    stationary: bool = False,
    prop_decrease: float = 1.0,
    n_std_thresh_stationary: float = 1.5,
    n_fft: int = 1024,
    win_length: int = None,
    hop_length: int = None,
    n_jobs: int = 1,  # Number of parallel jobs
    use_torch: bool = False,  # Use PyTorch for spectral gating
    device: str = "cuda",  # Device for PyTorch computation
) -> np.ndarray:
    """
    Apply noise reduction using the noisereduce package.

    Args:
        audio_chunk (np.ndarray): The audio chunk to process.
        sampling_rate (int): The sample rate of the audio.
        stationary (bool): Whether to perform stationary noise reduction.
        prop_decrease (float): Proportion to reduce noise by (1.0 = 100%).
        n_std_thresh_stationary (float): Number of standard deviations above mean for thresholding.
        n_fft (int): FFT window size.
        win_length (int): Window length for STFT.
        hop_length (int): Hop length for STFT.
        n_jobs (int): Number of parallel jobs to run. Set to -1 to use all CPU cores.
        use_torch (bool): Whether to use the PyTorch version of spectral gating.
        device (str): Device to run the PyTorch spectral gating on (e.g., "cuda" or "cpu").

    Returns:
        np.ndarray: The processed audio chunk with reduced noise.
    """
    # Apply noise reduction using noisereduce
    reduced_noise = nr.reduce_noise(
        y=audio_chunk,
        sr=sampling_rate,
        stationary=stationary,
        prop_decrease=prop_decrease,
        n_std_thresh_stationary=n_std_thresh_stationary,
        n_fft=n_fft,
        win_length=win_length,
        hop_length=hop_length,
        n_jobs=n_jobs,  # Pass the number of parallel jobs
        use_torch=use_torch,  # Enable/disable PyTorch
        device=device,  # Specify the device for PyTorch
    )
    return reduced_noise
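
A minimal CPU-only sketch (import path as in the source file above). It denoises a synthetic tone buried in white noise; with use_torch=False everything stays on the CPU, and the device argument only matters when use_torch=True.

import numpy as np

from live_audio_capture.audio_noise_reduction import AudioNoiseReduction

sr = 16000
t = np.arange(sr) / sr
tone = 0.5 * np.sin(2 * np.pi * 440 * t)
noisy = (tone + 0.2 * np.random.randn(sr)).astype(np.float32)

# Non-stationary spectral gating on the CPU; prop_decrease < 1.0 keeps some
# residual noise, which often sounds more natural than full removal.
clean = AudioNoiseReduction.apply_noise_reduction(
    noisy,
    sampling_rate=sr,
    stationary=False,
    prop_decrease=0.9,
    use_torch=False,
)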

resample_audio(audio_chunk, original_rate, target_rate) staticmethod

Resample the audio chunk to a target sample rate.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| audio_chunk | ndarray | The audio chunk to resample. | required |
| original_rate | int | The original sample rate. | required |
| target_rate | int | The target sample rate. | required |

Returns:

| Type | Description |
| --- | --- |
| ndarray | The resampled audio chunk. |

Source code in live_audio_capture\audio_noise_reduction.py
@staticmethod
def resample_audio(
    audio_chunk: np.ndarray,
    original_rate: int,
    target_rate: int,
) -> np.ndarray:
    """
    Resample the audio chunk to a target sample rate.

    Args:
        audio_chunk (np.ndarray): The audio chunk to resample.
        original_rate (int): The original sample rate.
        target_rate (int): The target sample rate.

    Returns:
        np.ndarray: The resampled audio chunk.
    """
    num_samples = int(len(audio_chunk) * target_rate / original_rate)
    return resample(audio_chunk, num_samples)
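
A short sketch (import path as above) downsampling one second of audio from 44.1 kHz to 16 kHz; the output length scales by target_rate / original_rate.

import numpy as np

from live_audio_capture.audio_noise_reduction import AudioNoiseReduction

chunk_44k = np.random.randn(44100).astype(np.float32)  # one second at 44.1 kHz
chunk_16k = AudioNoiseReduction.resample_audio(chunk_44k, original_rate=44100, target_rate=16000)
assert len(chunk_16k) == 16000  # 44100 * 16000 / 44100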

AudioPlayback

Utilities for playing audio files and sounds.

Source code in live_audio_capture\audio_utils\audio_playback.py
class AudioPlayback:
    """
    Utilities for playing audio files and sounds.
    """

    @staticmethod
    def play_audio_file(file_path: str) -> None:
        """
        Play an audio file using the simpleaudio library.

        Args:
            file_path (str): Path to the audio file to play.
        """
        try:
            audio = AudioSegment.from_file(file_path)
            raw_data = audio.raw_data
            play_obj = sa.play_buffer(raw_data, num_channels=audio.channels, bytes_per_sample=audio.sample_width, sample_rate=audio.frame_rate)
            play_obj.wait_done()
        except Exception as e:
            print(f"Failed to play audio file: {e}")

    @staticmethod
    def play_beep(frequency: int, duration: int) -> None:
        """
        Play a beep sound asynchronously using the simpleaudio library.

        Args:
            frequency (int): Frequency of the beep sound in Hz.
            duration (int): Duration of the beep sound in milliseconds.
        """
        try:
            sample_rate = 44100
            t = np.linspace(0, duration / 1000, int(sample_rate * duration / 1000), endpoint=False)
            waveform = np.sin(2 * np.pi * frequency * t)
            waveform = (waveform * 32767).astype(np.int16)
            # Start playback without blocking; simpleaudio continues in the
            # background, so the beep is not cut off by an immediate stop().
            sa.play_buffer(waveform, num_channels=1, bytes_per_sample=2, sample_rate=sample_rate)
        except Exception as e:
            print(f"Failed to play beep sound: {e}")

play_audio_file(file_path) staticmethod

Play an audio file using the simpleaudio library.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| file_path | str | Path to the audio file to play. | required |

Source code in live_audio_capture\audio_utils\audio_playback.py
@staticmethod
def play_audio_file(file_path: str) -> None:
    """
    Play an audio file using the simpleaudio library.

    Args:
        file_path (str): Path to the audio file to play.
    """
    try:
        audio = AudioSegment.from_file(file_path)
        raw_data = audio.raw_data
        play_obj = sa.play_buffer(raw_data, num_channels=audio.channels, bytes_per_sample=audio.sample_width, sample_rate=audio.frame_rate)
        play_obj.wait_done()
    except Exception as e:
        print(f"Failed to play audio file: {e}")

play_beep(frequency, duration) staticmethod

Play a beep sound asynchronously using the simpleaudio library.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| frequency | int | Frequency of the beep sound in Hz. | required |
| duration | int | Duration of the beep sound in milliseconds. | required |

Source code in live_audio_capture\audio_utils\audio_playback.py
@staticmethod
def play_beep(frequency: int, duration: int) -> None:
    """
    Play a beep sound asynchronously using the simpleaudio library.

    Args:
        frequency (int): Frequency of the beep sound in Hz.
        duration (int): Duration of the beep sound in milliseconds.
    """
    try:
        sample_rate = 44100
        t = np.linspace(0, duration / 1000, int(sample_rate * duration / 1000), endpoint=False)
        waveform = np.sin(2 * np.pi * frequency * t)
        waveform = (waveform * 32767).astype(np.int16)
        # Start playback without blocking; simpleaudio continues in the
        # background, so the beep is not cut off by an immediate stop().
        sa.play_buffer(waveform, num_channels=1, bytes_per_sample=2, sample_rate=sample_rate)
    except Exception as e:
        print(f"Failed to play beep sound: {e}")

AudioProcessing

Utilities for processing audio data.

Source code in live_audio_capture\audio_utils\audio_processing.py
class AudioProcessing:
    """
    Utilities for processing audio data.
    """

    @staticmethod
    def calculate_energy(audio_chunk: np.ndarray) -> float:
        """
        Calculate the energy of an audio chunk.

        Args:
            audio_chunk (np.ndarray): The audio chunk to process.

        Returns:
            float: The energy of the audio chunk.
        """
        return np.sum(audio_chunk**2) / len(audio_chunk)

    @staticmethod
    def process_audio_chunk(raw_data: bytes, audio_format: str = "f32le") -> np.ndarray:
        """
        Convert raw audio data to a NumPy array based on the audio format.

        Args:
            raw_data (bytes): Raw audio data from the microphone.
            audio_format (str): Audio format (e.g., "f32le" or "s16le").

        Returns:
            np.ndarray: The processed audio chunk.
        """
        if audio_format == "f32le":
            return np.frombuffer(raw_data, dtype=np.float32)
        elif audio_format == "s16le":
            return np.frombuffer(raw_data, dtype=np.int16) / 32768.0  # Normalize to [-1, 1]
        else:
            raise ValueError(f"Unsupported audio format: {audio_format}")

    @staticmethod
    def apply_noise_reduction_to_file(
        input_file: str,
        output_file: str,
        stationary: bool = False,
        prop_decrease: float = 1.0,
        n_std_thresh_stationary: float = 1.5,
        n_jobs: int = 1,
        use_torch: bool = False,
        device: str = "cuda",
    ) -> None:
        """
        Apply noise reduction to an audio file and save the result.

        Args:
            input_file (str): Path to the input audio file.
            output_file (str): Path to save the processed audio file.
            stationary (bool): Whether to perform stationary noise reduction.
            prop_decrease (float): Proportion to reduce noise by (1.0 = 100%).
            n_std_thresh_stationary (float): Threshold for stationary noise reduction.
            n_jobs (int): Number of parallel jobs to run. Set to -1 to use all CPU cores.
            use_torch (bool): Whether to use the PyTorch version of spectral gating.
            device (str): Device to run the PyTorch spectral gating on (e.g., "cuda" or "cpu").
        """
        try:
            # Load the audio file using pydub
            audio = AudioSegment.from_file(input_file)

            # Convert the audio to a NumPy array
            samples = np.array(audio.get_array_of_samples())
            sample_rate = audio.frame_rate

            # Normalize the audio to the range [-1, 1]
            if audio.sample_width == 2:  # 16-bit audio
                samples = samples / 32768.0
            elif audio.sample_width == 4:  # 32-bit audio
                samples = samples / 2147483648.0

            # Apply noise reduction
            reduced_noise = nr.reduce_noise(
                y=samples,
                sr=sample_rate,
                stationary=stationary,
                prop_decrease=prop_decrease,
                n_std_thresh_stationary=n_std_thresh_stationary,
                n_jobs=n_jobs,
                use_torch=use_torch,
                device=device,
            )

            # Scale the audio back to the original range
            if audio.sample_width == 2:  # 16-bit audio
                reduced_noise = (reduced_noise * 32768.0).astype(np.int16)
            elif audio.sample_width == 4:  # 32-bit audio
                reduced_noise = (reduced_noise * 2147483648.0).astype(np.int32)

            # Convert the NumPy array back to an AudioSegment
            processed_audio = AudioSegment(
                reduced_noise.tobytes(),
                frame_rate=sample_rate,
                sample_width=audio.sample_width,
                channels=audio.channels,
            )

            # Save the processed audio to the output file
            processed_audio.export(output_file, format=output_file.split(".")[-1])
            print(f"Noise-reduced audio saved to {output_file}")

        except Exception as e:
            print(f"Failed to apply noise reduction to audio file: {e}")

apply_noise_reduction_to_file(input_file, output_file, stationary=False, prop_decrease=1.0, n_std_thresh_stationary=1.5, n_jobs=1, use_torch=False, device='cuda') staticmethod

Apply noise reduction to an audio file and save the result.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| input_file | str | Path to the input audio file. | required |
| output_file | str | Path to save the processed audio file. | required |
| stationary | bool | Whether to perform stationary noise reduction. | False |
| prop_decrease | float | Proportion to reduce noise by (1.0 = 100%). | 1.0 |
| n_std_thresh_stationary | float | Threshold for stationary noise reduction. | 1.5 |
| n_jobs | int | Number of parallel jobs to run. Set to -1 to use all CPU cores. | 1 |
| use_torch | bool | Whether to use the PyTorch version of spectral gating. | False |
| device | str | Device to run the PyTorch spectral gating on (e.g., "cuda" or "cpu"). | 'cuda' |

Source code in live_audio_capture\audio_utils\audio_processing.py
@staticmethod
def apply_noise_reduction_to_file(
    input_file: str,
    output_file: str,
    stationary: bool = False,
    prop_decrease: float = 1.0,
    n_std_thresh_stationary: float = 1.5,
    n_jobs: int = 1,
    use_torch: bool = False,
    device: str = "cuda",
) -> None:
    """
    Apply noise reduction to an audio file and save the result.

    Args:
        input_file (str): Path to the input audio file.
        output_file (str): Path to save the processed audio file.
        stationary (bool): Whether to perform stationary noise reduction.
        prop_decrease (float): Proportion to reduce noise by (1.0 = 100%).
        n_std_thresh_stationary (float): Threshold for stationary noise reduction.
        n_jobs (int): Number of parallel jobs to run. Set to -1 to use all CPU cores.
        use_torch (bool): Whether to use the PyTorch version of spectral gating.
        device (str): Device to run the PyTorch spectral gating on (e.g., "cuda" or "cpu").
    """
    try:
        # Load the audio file using pydub
        audio = AudioSegment.from_file(input_file)

        # Convert the audio to a NumPy array
        samples = np.array(audio.get_array_of_samples())
        sample_rate = audio.frame_rate

        # Normalize the audio to the range [-1, 1]
        if audio.sample_width == 2:  # 16-bit audio
            samples = samples / 32768.0
        elif audio.sample_width == 4:  # 32-bit audio
            samples = samples / 2147483648.0

        # Apply noise reduction
        reduced_noise = nr.reduce_noise(
            y=samples,
            sr=sample_rate,
            stationary=stationary,
            prop_decrease=prop_decrease,
            n_std_thresh_stationary=n_std_thresh_stationary,
            n_jobs=n_jobs,
            use_torch=use_torch,
            device=device,
        )

        # Scale the audio back to the original range
        if audio.sample_width == 2:  # 16-bit audio
            reduced_noise = (reduced_noise * 32768.0).astype(np.int16)
        elif audio.sample_width == 4:  # 32-bit audio
            reduced_noise = (reduced_noise * 2147483648.0).astype(np.int32)

        # Convert the NumPy array back to an AudioSegment
        processed_audio = AudioSegment(
            reduced_noise.tobytes(),
            frame_rate=sample_rate,
            sample_width=audio.sample_width,
            channels=audio.channels,
        )

        # Save the processed audio to the output file
        processed_audio.export(output_file, format=output_file.split(".")[-1])
        print(f"Noise-reduced audio saved to {output_file}")

    except Exception as e:
        print(f"Failed to apply noise reduction to audio file: {e}")

calculate_energy(audio_chunk) staticmethod

Calculate the energy of an audio chunk.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| audio_chunk | ndarray | The audio chunk to process. | required |

Returns:

| Type | Description |
| --- | --- |
| float | The energy of the audio chunk. |

Source code in live_audio_capture\audio_utils\audio_processing.py
@staticmethod
def calculate_energy(audio_chunk: np.ndarray) -> float:
    """
    Calculate the energy of an audio chunk.

    Args:
        audio_chunk (np.ndarray): The audio chunk to process.

    Returns:
        float: The energy of the audio chunk.
    """
    return np.sum(audio_chunk**2) / len(audio_chunk)
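
Note that the returned value is the mean squared amplitude (average power), since the sum of squares is divided by the chunk length. A quick sketch (import path as in the source file above):

import numpy as np

from live_audio_capture.audio_utils.audio_processing import AudioProcessing

sr = 16000
tone = 0.5 * np.sin(2 * np.pi * 440 * np.arange(sr) / sr).astype(np.float32)

print(AudioProcessing.calculate_energy(np.zeros(sr, dtype=np.float32)))  # 0.0
print(AudioProcessing.calculate_energy(tone))  # ~0.125 == (0.5 ** 2) / 2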

process_audio_chunk(raw_data, audio_format='f32le') staticmethod

Convert raw audio data to a NumPy array based on the audio format.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| raw_data | bytes | Raw audio data from the microphone. | required |
| audio_format | str | Audio format (e.g., "f32le" or "s16le"). | 'f32le' |

Returns:

| Type | Description |
| --- | --- |
| ndarray | The processed audio chunk. |

Source code in live_audio_capture\audio_utils\audio_processing.py
@staticmethod
def process_audio_chunk(raw_data: bytes, audio_format: str = "f32le") -> np.ndarray:
    """
    Convert raw audio data to a NumPy array based on the audio format.

    Args:
        raw_data (bytes): Raw audio data from the microphone.
        audio_format (str): Audio format (e.g., "f32le" or "s16le").

    Returns:
        np.ndarray: The processed audio chunk.
    """
    if audio_format == "f32le":
        return np.frombuffer(raw_data, dtype=np.float32)
    elif audio_format == "s16le":
        return np.frombuffer(raw_data, dtype=np.int16) / 32768.0  # Normalize to [-1, 1]
    else:
        raise ValueError(f"Unsupported audio format: {audio_format}")

AudioVisualizer

A standalone real-time audio visualizer using PyQtGraph with a refined color scheme.

Source code in live_audio_capture\visualization.py
class AudioVisualizer:
    """
    A standalone real-time audio visualizer using PyQtGraph with a refined color scheme.
    """

    def __init__(self, sampling_rate: int, chunk_duration: float):
        """
        Initialize the AudioVisualizer instance.

        Args:
            sampling_rate (int): The sample rate of the audio.
            chunk_duration (float): The duration of each audio chunk in seconds.
        """
        self.sampling_rate = sampling_rate
        self.chunk_duration = chunk_duration
        self.chunk_size = int(sampling_rate * chunk_duration)

        # Create a thread-safe queue for audio chunks
        self.audio_queue = queue.Queue()

        # Flag to control the visualization thread
        self.running = False

        # Start the visualization thread
        self.thread = Thread(target=self._run_visualizer, daemon=True)
        self.thread.start()

    def _run_visualizer(self):
        """Run the PyQtGraph visualizer in a separate thread."""
        # Create a PyQtGraph application
        self.app = QtWidgets.QApplication([])
        self.win = pg.GraphicsLayoutWidget(title="Real-Time Audio Visualizer")
        self.win.resize(1200, 800)  # Increase window size for additional plots
        self.win.setBackground("#1f1f1f")  # Dark gray background
        self.win.show()

        # Custom font for labels
        _ = QtGui.QFont("Arial", 12)
        pg.setConfigOptions(antialias=True, useNumba=True)

        # Create a plot for the waveform
        self.waveform_plot = self.win.addPlot(title="Waveform", row=0, col=0)
        self.waveform_curve = self.waveform_plot.plot(pen=pg.mkPen("#00ffff", width=2))  # Soft cyan
        self.waveform_plot.setYRange(-1, 1)
        self.waveform_plot.setXRange(0, self.chunk_size)
        self.waveform_plot.setTitle("Waveform", color="#ffffff", size="14pt")
        self.waveform_plot.setLabel("left", "Amplitude", color="#ffffff", **{"font-size": "12pt"})
        self.waveform_plot.setLabel("bottom", "Time (samples)", color="#ffffff", **{"font-size": "12pt"})

        # Create a plot for the frequency spectrum
        self.spectrum_plot = self.win.addPlot(title="Frequency Spectrum", row=0, col=1)
        self.spectrum_curve = self.spectrum_plot.plot(pen=pg.mkPen("#ff00ff", width=2))  # Soft magenta
        self.spectrum_plot.setLogMode(x=True, y=False)  # Logarithmic frequency axis
        self.spectrum_plot.setLabel("left", "Magnitude (dB)", color="#ffffff", **{"font-size": "12pt"})
        self.spectrum_plot.setLabel("bottom", "Frequency (Hz)", color="#ffffff", **{"font-size": "12pt"})
        self.spectrum_plot.setYRange(-100, 0)
        self.spectrum_plot.setXRange(20, self.sampling_rate / 2)  # 20 Hz to Nyquist frequency
        self.spectrum_plot.setTitle("Frequency Spectrum", color="#ffffff", size="14pt")

        # Add a peak frequency indicator
        self.peak_freq_text = pg.TextItem(anchor=(0.5, 1), color="#ff00ff")  # Soft magenta
        self.spectrum_plot.addItem(self.peak_freq_text)

        # Create a plot for the spectrogram
        self.spectrogram_plot = self.win.addPlot(title="Spectrogram", row=1, col=0, colspan=2)
        self.spectrogram_image = pg.ImageItem()
        self.spectrogram_plot.addItem(self.spectrogram_image)
        self.spectrogram_plot.setLabel("left", "Frequency (Hz)", color="#ffffff", **{"font-size": "12pt"})
        self.spectrogram_plot.setLabel("bottom", "Time (s)", color="#ffffff", **{"font-size": "12pt"})
        self.spectrogram_plot.setTitle("Spectrogram", color="#ffffff", size="14pt")

        # Set a color map for the spectrogram (deep blue to bright yellow)
        self.colormap = pg.ColorMap(
            [0.0, 1.0],  # Positions for the colors
            [
                (0, 0, 255),  # Deep blue at position 0.0
                (255, 255, 0),  # Bright yellow at position 1.0
            ]
        )
        self.spectrogram_image.setLookupTable(self.colormap.getLookupTable())

        # Initialize spectrogram data
        self.spectrogram_data = np.zeros((129, 100))  # 129 frequency bins, 100 time steps
        self.spectrogram_image.setImage(self.spectrogram_data)
        self.spectrogram_image.setLevels([-50, 0])  # Adjust levels for better contrast

        # Create a volume meter
        self.volume_meter = self.win.addPlot(title="Volume Meter", row=2, col=0)
        self.volume_bar = pg.BarGraphItem(x=[0], height=[0], width=0.6, brush="#ff8c42")  # Gradient orange
        self.volume_meter.addItem(self.volume_bar)
        self.volume_meter.setYRange(0, 1)
        self.volume_meter.setXRange(-1, 1)
        self.volume_meter.setTitle("Volume Meter", color="#ffffff", size="14pt")
        self.volume_meter.setLabel("left", "Volume", color="#ffffff", **{"font-size": "12pt"})

        # Create a volume history plot
        self.volume_history_plot = self.win.addPlot(title="Volume History", row=2, col=1)
        self.volume_history_curve = self.volume_history_plot.plot(pen=pg.mkPen("#00ff00", width=2))  # Soft green
        self.volume_history_plot.setYRange(0, 1)
        self.volume_history_plot.setXRange(0, 100)  # Show last 100 volume readings
        self.volume_history_plot.setTitle("Volume History", color="#ffffff", size="14pt")
        self.volume_history_plot.setLabel("left", "Volume", color="#ffffff", **{"font-size": "12pt"})
        self.volume_history_plot.setLabel("bottom", "Time (samples)", color="#ffffff", **{"font-size": "12pt"})

        # Timer for updating the visualization
        self.timer = QtCore.QTimer()
        self.timer.timeout.connect(self._update)
        self.timer.start(100)  # Update every 100 ms

        # Start the Qt event loop
        self.running = True
        self.app.exec()

    def _update(self):
        """Update the visualization with the latest audio chunk."""
        try:
            # Get the latest audio chunk from the queue
            audio_chunk = self.audio_queue.get_nowait()

            # Update waveform
            self.waveform_curve.setData(audio_chunk)

            # Update frequency spectrum
            spectrum = self.compute_spectrum(audio_chunk)
            if spectrum is not None:
                freqs = np.fft.rfftfreq(len(audio_chunk), 1 / self.sampling_rate)
                self.spectrum_curve.setData(freqs, spectrum)

                # Update peak frequency indicator
                peak_freq = freqs[np.argmax(spectrum)]
                self.peak_freq_text.setText(f"Peak: {peak_freq:.1f} Hz")
                self.peak_freq_text.setPos(peak_freq, np.max(spectrum))

            # Update spectrogram
            spectrogram_chunk = self.compute_spectrogram(audio_chunk)
            if spectrogram_chunk is not None:
                self.spectrogram_data = np.roll(self.spectrogram_data, -1, axis=1)
                self.spectrogram_data[:, -1] = spectrogram_chunk
                self.spectrogram_image.setImage(self.spectrogram_data, autoLevels=False)

            # Update volume meter
            volume = np.sqrt(np.mean(audio_chunk**2))  # RMS volume
            self.volume_bar.setOpts(height=[volume])

            # Update volume history
            if not hasattr(self, "volume_history"):
                self.volume_history = np.zeros(100)
            self.volume_history = np.roll(self.volume_history, -1)
            self.volume_history[-1] = volume
            self.volume_history_curve.setData(self.volume_history)

        except queue.Empty:
            # No new audio chunk available
            pass

    def compute_spectrum(self, audio_chunk: np.ndarray) -> Optional[np.ndarray]:
        """
        Compute the frequency spectrum for a given audio chunk.

        Args:
            audio_chunk (np.ndarray): The audio chunk to process.

        Returns:
            Optional[np.ndarray]: The frequency spectrum in dB.
        """
        try:
            # Compute the FFT
            fft = np.fft.rfft(audio_chunk)
            magnitude = np.abs(fft)
            return 10 * np.log10(magnitude + 1e-10)  # Convert to dB
        except Exception:
            return None

    def compute_spectrogram(self, audio_chunk: np.ndarray) -> Optional[np.ndarray]:
        """
        Compute the spectrogram for a given audio chunk.

        Args:
            audio_chunk (np.ndarray): The audio chunk to process.

        Returns:
            Optional[np.ndarray]: The spectrogram data.
        """
        try:
            _, _, Sxx = spectrogram(audio_chunk, fs=self.sampling_rate, nperseg=256)
            return 10 * np.log10(Sxx.mean(axis=1) + 1e-10)  # Convert to dB
        except ValueError:
            # Handle cases where the audio chunk is too short for the spectrogram
            return None

    def add_audio_chunk(self, audio_chunk: np.ndarray):
        """Add a new audio chunk to the visualization queue."""
        self.audio_queue.put(audio_chunk)

    def stop(self):
        """Stop the visualization."""
        self.running = False
        self.app.quit()
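
A minimal sketch that feeds synthetic chunks to the visualizer (import path follows the source file above). The constructor spawns the Qt window on a background thread, so the caller only has to keep producing chunks at roughly the chunk rate; stop() asks the Qt event loop to quit.

import time

import numpy as np

from live_audio_capture.visualization import AudioVisualizer

viz = AudioVisualizer(sampling_rate=16000, chunk_duration=0.1)
for _ in range(50):  # about five seconds of synthetic audio
    viz.add_audio_chunk(0.2 * np.random.randn(1600).astype(np.float32))
    time.sleep(0.1)  # pace chunks at the 100 ms chunk duration
viz.stop()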

__init__(sampling_rate, chunk_duration)

Initialize the AudioVisualizer instance.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| sampling_rate | int | The sample rate of the audio. | required |
| chunk_duration | float | The duration of each audio chunk in seconds. | required |

Source code in live_audio_capture\visualization.py
def __init__(self, sampling_rate: int, chunk_duration: float):
    """
    Initialize the AudioVisualizer instance.

    Args:
        sampling_rate (int): The sample rate of the audio.
        chunk_duration (float): The duration of each audio chunk in seconds.
    """
    self.sampling_rate = sampling_rate
    self.chunk_duration = chunk_duration
    self.chunk_size = int(sampling_rate * chunk_duration)

    # Create a thread-safe queue for audio chunks
    self.audio_queue = queue.Queue()

    # Flag to control the visualization thread
    self.running = False

    # Start the visualization thread
    self.thread = Thread(target=self._run_visualizer, daemon=True)
    self.thread.start()

add_audio_chunk(audio_chunk)

Add a new audio chunk to the visualization queue.

Source code in live_audio_capture\visualization.py
def add_audio_chunk(self, audio_chunk: np.ndarray):
    """Add a new audio chunk to the visualization queue."""
    self.audio_queue.put(audio_chunk)

compute_spectrogram(audio_chunk)

Compute the spectrogram for a given audio chunk.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| audio_chunk | ndarray | The audio chunk to process. | required |

Returns:

| Type | Description |
| --- | --- |
| Optional[ndarray] | The spectrogram data. |

Source code in live_audio_capture\visualization.py
def compute_spectrogram(self, audio_chunk: np.ndarray) -> Optional[np.ndarray]:
    """
    Compute the spectrogram for a given audio chunk.

    Args:
        audio_chunk (np.ndarray): The audio chunk to process.

    Returns:
        Optional[np.ndarray]: The spectrogram data.
    """
    try:
        _, _, Sxx = spectrogram(audio_chunk, fs=self.sampling_rate, nperseg=256)
        return 10 * np.log10(Sxx.mean(axis=1) + 1e-10)  # Convert to dB
    except ValueError:
        # Handle cases where the audio chunk is too short for the spectrogram
        return None

compute_spectrum(audio_chunk)

Compute the frequency spectrum for a given audio chunk.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| audio_chunk | ndarray | The audio chunk to process. | required |

Returns:

| Type | Description |
| --- | --- |
| Optional[ndarray] | The frequency spectrum in dB. |

Source code in live_audio_capture\visualization.py
def compute_spectrum(self, audio_chunk: np.ndarray) -> Optional[np.ndarray]:
    """
    Compute the frequency spectrum for a given audio chunk.

    Args:
        audio_chunk (np.ndarray): The audio chunk to process.

    Returns:
        Optional[np.ndarray]: The frequency spectrum in dB.
    """
    try:
        # Compute the FFT
        fft = np.fft.rfft(audio_chunk)
        magnitude = np.abs(fft)
        return 10 * np.log10(magnitude + 1e-10)  # Convert to dB
    except Exception:
        return None

stop()

Stop the visualization.

Source code in live_audio_capture\visualization.py
def stop(self):
    """Stop the visualization."""
    self.running = False
    self.app.quit()

LiveAudioCapture

A cross-platform utility for capturing live audio from a microphone using FFmpeg. Features:

- Continuous listening mode.
- Dynamic recording based on voice activity.
- Silence duration threshold for stopping recording.
- Optional beep sounds for start/stop feedback.
- Save recordings in multiple formats (WAV, MP3, OGG).

Source code in live_audio_capture\audio_capture.py
class LiveAudioCapture:
    """
    A cross-platform utility for capturing live audio from a microphone using FFmpeg.
    Features:
    - Continuous listening mode.
    - Dynamic recording based on voice activity.
    - Silence duration threshold for stopping recording.
    - Optional beep sounds for start/stop feedback.
    - Save recordings in multiple formats (WAV, MP3, OGG).
    """

    def __init__(
        self,
        sampling_rate: int = 16000,
        chunk_duration: float = 0.1,
        audio_format: str = "f32le",
        channels: int = 1,
        aggressiveness: int = 1,  # Aggressiveness level for VAD
        enable_beep: bool = True,
        enable_noise_canceling: bool = False,
        low_pass_cutoff: float = 7500.0,
        stationary_noise_reduction: bool = False,
        prop_decrease: float = 1.0,
        n_std_thresh_stationary: float = 1.5,
        n_jobs: int = 1,
        use_torch: bool = False,
        device: str = "cuda",
        calibration_duration: float = 2.0,  # Duration of calibration in seconds
        use_adaptive_threshold: bool = True,  # Enable adaptive thresholding
    ):
        """
        Initialize the LiveAudioCapture instance.

        Args:
            sampling_rate (int): Sample rate in Hz (e.g., 16000).
            chunk_duration (float): Duration of each audio chunk in seconds (e.g., 0.1).
            audio_format (str): Audio format for FFmpeg output (e.g., "f32le").
            channels (int): Number of audio channels (1 for mono, 2 for stereo).
            aggressiveness (int): Aggressiveness level for VAD (0 = least aggressive, 3 = most aggressive).
            enable_beep (bool): Whether to play beep sounds when recording starts/stops.
            enable_noise_canceling (bool): Whether to apply noise cancellation.
            low_pass_cutoff (float): Cutoff frequency for the low-pass filter.
            stationary_noise_reduction (bool): Whether to use stationary noise reduction.
            prop_decrease (float): Proportion to reduce noise by (1.0 = 100%).
            n_std_thresh_stationary (float): Threshold for stationary noise reduction.
            n_jobs (int): Number of parallel jobs to run. Set to -1 to use all CPU cores.
            use_torch (bool): Whether to use the PyTorch version of spectral gating.
            device (str): Device to run the PyTorch spectral gating on (e.g., "cuda" or "cpu").
            calibration_duration (float): Duration of the calibration phase in seconds.
            use_adaptive_threshold (bool): Whether to use adaptive thresholding for VAD.
        """
        self.sampling_rate = sampling_rate
        self.chunk_duration = chunk_duration
        self.audio_format = audio_format
        self.channels = channels
        self.enable_beep = enable_beep
        self.enable_noise_canceling = enable_noise_canceling
        self.low_pass_cutoff = low_pass_cutoff
        self.stationary_noise_reduction = stationary_noise_reduction
        self.prop_decrease = prop_decrease
        self.n_std_thresh_stationary = n_std_thresh_stationary
        self.n_jobs = n_jobs
        self.use_torch = use_torch
        self.device = device
        self.calibration_duration = calibration_duration
        self.use_adaptive_threshold = use_adaptive_threshold
        self.process: Optional[subprocess.Popen] = None
        self.is_streaming = False
        self.is_recording = False

        # Validate the cutoff frequency
        nyquist = 0.5 * self.sampling_rate
        if self.low_pass_cutoff >= nyquist:
            raise ValueError(
                f"Cutoff frequency must be less than the Nyquist frequency ({nyquist} Hz). "
                f"Provided cutoff frequency: {self.low_pass_cutoff} Hz."
            )

        # Initialize VAD
        self.vad = VoiceActivityDetector(
            sample_rate=sampling_rate,
            frame_duration=chunk_duration,
            aggressiveness=aggressiveness,
            hysteresis_high=1.5,
            hysteresis_low=0.5,
            enable_noise_canceling=self.enable_noise_canceling,
            calibration_duration=self.calibration_duration,
            use_adaptive_threshold=self.use_adaptive_threshold,
            audio_format=self.audio_format,
            channels=self.channels
        )

        # Determine the input device based on the platform
        if sys.platform == "linux":
            self.input_format = "alsa"
        elif sys.platform == "darwin":  # macOS
            self.input_format = "avfoundation"
        elif sys.platform == "win32":
            self.input_format = "dshow"
        else:
            raise RuntimeError(f"Unsupported platform: {sys.platform}")

        # Get the default microphone
        self.input_device = MicUtils.get_default_mic()
        print(f"Using input device: {self.input_device}")

    def list_available_mics(self) -> Dict[str, str]:
        """
        List all available microphones on the system.

        Returns:
            Dict[str, str]: A dictionary mapping microphone names to their device IDs.
        """
        return MicUtils.list_mics()

    def change_input_device(self, mic_name: str) -> None:
        """
        Change the input device to the specified microphone by name.

        Args:
            mic_name (str): The name of the microphone to use.
        """
        mics = self.list_available_mics()
        if mic_name not in mics:
            raise ValueError(f"Microphone '{mic_name}' not found. Available microphones: {list(mics.keys())}")
        self.input_device = mics[mic_name]
        print(f"Changed input device to: {self.input_device}")

    def play_audio_file(self, file_path: str) -> None:
        """
        Play an audio file using the simpleaudio library.

        Args:
            file_path (str): Path to the audio file to play.
        """
        AudioPlayback.play_audio_file(file_path)

    def apply_noise_reduction_to_file(
        self,
        input_file: str,
        output_file: str,
        stationary: bool = False,
        prop_decrease: float = 1.0,
        n_std_thresh_stationary: float = 1.5,
        n_jobs: int = 1,
        use_torch: bool = False,
        device: str = "cuda",
    ) -> None:
        """
        Apply noise reduction to an audio file and save the result.

        Args:
            input_file (str): Path to the input audio file.
            output_file (str): Path to save the processed audio file.
            stationary (bool): Whether to perform stationary noise reduction.
            prop_decrease (float): Proportion to reduce noise by (1.0 = 100%).
            n_std_thresh_stationary (float): Threshold for stationary noise reduction.
            n_jobs (int): Number of parallel jobs to run. Set to -1 to use all CPU cores.
            use_torch (bool): Whether to use the PyTorch version of spectral gating.
            device (str): Device to run the PyTorch spectral gating on (e.g., "cuda" or "cpu").
        """
        AudioProcessing.apply_noise_reduction_to_file(
            input_file,
            output_file,
            stationary=stationary,
            prop_decrease=prop_decrease,
            n_std_thresh_stationary=n_std_thresh_stationary,
            n_jobs=n_jobs,
            use_torch=use_torch,
            device=device,
        )

    def _start_ffmpeg_process(self) -> None:
        """Start the FFmpeg process for capturing live audio."""
        if self.process is not None:
            return  # FFmpeg process is already running

        # Calculate chunk size in bytes
        bytes_per_sample = 4 if self.audio_format == "f32le" else 2  # 32-bit float or 16-bit int
        self.chunk_size = int(self.sampling_rate * self.chunk_duration * self.channels * bytes_per_sample)

        # FFmpeg command to capture live audio
        command = [
            "ffmpeg",
            "-f", self.input_format,       # Input format (platform-specific)
            "-i", self.input_device,       # Input device (platform-specific)
            "-ar", str(self.sampling_rate),  # Sample rate
            "-ac", str(self.channels),     # Number of channels
            "-f", self.audio_format,      # Output format
            "-"                           # Output to stdout
        ]

        try:
            # Start FFmpeg process
            self.process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        except Exception as e:
            raise RuntimeError(f"Failed to start FFmpeg process: {e}")

    def stream_audio(self) -> Generator[np.ndarray, None, None]:
        """Stream live audio from the microphone."""
        self._start_ffmpeg_process()
        self.is_streaming = True

        try:
            while self.is_streaming:
                # Read raw audio data from FFmpeg stdout
                raw_data = self.process.stdout.read(self.chunk_size)
                if not raw_data:
                    # Print FFmpeg errors if no data is received
                    stderr = self.process.stderr.read().decode()
                    if stderr:
                        print("FFmpeg errors:", stderr)
                    break

                # Convert raw data to NumPy array
                if self.audio_format == "f32le":
                    audio_chunk = np.frombuffer(raw_data, dtype=np.float32)
                elif self.audio_format == "s16le":
                    audio_chunk = np.frombuffer(raw_data, dtype=np.int16) / 32768.0  # Normalize to [-1, 1]
                else:
                    raise RuntimeError(f"Unsupported audio format: {self.audio_format}")

                # Yield the audio chunk for processing
                yield audio_chunk

        except KeyboardInterrupt:
            print("\nStreaming interrupted by user.")
        finally:
            self.stop_streaming()

    def stop_streaming(self) -> None:
        """Stop the audio stream and terminate the FFmpeg process."""
        self.is_streaming = False
        if self.process:
            try:
                self.process.terminate()
                self.process.wait(timeout=5)  # Wait for the process to terminate
            except subprocess.TimeoutExpired:
                self.process.kill()  # Force kill if it doesn't terminate
            self.process = None
        print("Streaming stopped.")

    def save_recording(self, audio_data: np.ndarray, output_file: str, format: str = "wav") -> None:
        """
        Save the recorded audio to a file in the specified format.

        Args:
            audio_data (np.ndarray): The recorded audio data.
            output_file (str): Path to save the recorded audio file.
            format (str): Output format (e.g., "wav", "mp3", "ogg").
        """
        # Scale the audio data to the appropriate range
        if self.audio_format == "f32le":
            # Scale floating-point data to the range [-1, 1]
            audio_data = np.clip(audio_data, -1.0, 1.0)
            # Convert to 16-bit integer format for saving
            audio_data = (audio_data * 32767).astype(np.int16)
        elif self.audio_format == "s16le":
            # Data is already in 16-bit integer format
            audio_data = audio_data.astype(np.int16)
        else:
            raise RuntimeError(f"Unsupported audio format: {self.audio_format}")

        # Convert the NumPy array to a PyDub AudioSegment
        audio_segment = AudioSegment(
            audio_data.tobytes(),
            frame_rate=self.sampling_rate,
            sample_width=2,  # 16-bit audio (2 bytes per sample)
            channels=self.channels,
        )

        # Normalize the volume to prevent clipping
        audio_segment = audio_segment.normalize()

        # Save the audio in the specified format
        audio_segment.export(output_file, format=format)
        print(f"Recording saved to {output_file} in {format.upper()} format.")

    def process_audio_chunk(self, audio_chunk: np.ndarray, enable_noise_canceling: bool = True) -> np.ndarray:
        """
        Process an audio chunk with optional noise cancellation.

        Args:
            audio_chunk (np.ndarray): The audio chunk to process.
            enable_noise_canceling (bool): Whether to apply noise cancellation.

        Returns:
            np.ndarray: The processed audio chunk.
        """
        if enable_noise_canceling:
            # Apply noise reduction using noisereduce
            audio_chunk = AudioNoiseReduction.apply_noise_reduction(
                audio_chunk,
                self.sampling_rate,
                stationary=self.stationary_noise_reduction,
                prop_decrease=self.prop_decrease,
                n_std_thresh_stationary=self.n_std_thresh_stationary,
                n_jobs=self.n_jobs,  # Pass the number of parallel jobs
                use_torch=self.use_torch,  # Enable/disable PyTorch
                device=self.device,  # Specify the device for PyTorch
            )
            # Apply low-pass filter
            audio_chunk = AudioNoiseReduction.apply_low_pass_filter(audio_chunk, self.sampling_rate, self.low_pass_cutoff)
        return audio_chunk

    def listen_and_record_with_vad(
        self,
        output_file: str = "output.wav",
        silence_duration: float = 2.0,
        format: str = "wav"
    ) -> None:
        """
        Continuously listen to the microphone and record speech segments.

        Args:
            output_file (str): Path to save the recorded audio file.
            silence_duration (float): Duration of silence (in seconds) to stop recording.
            format (str): Output format (e.g., "wav", "mp3", "ogg").
        """
        speech_segments: List[np.ndarray] = []
        self.is_recording = False
        silent_frames = 0
        silence_threshold_frames = int(silence_duration / self.chunk_duration)

        try:
            for audio_chunk in self.stream_audio():
                # Process the audio chunk with optional noise cancellation
                processed_chunk = self.process_audio_chunk(audio_chunk, self.enable_noise_canceling)

                # Process the audio chunk with VAD
                is_speech = self.vad.process_audio(processed_chunk)

                if is_speech:
                    # Speech detected
                    if not self.is_recording:
                        print("\nStarting recording...")
                        self.is_recording = True
                        AudioPlayback.play_beep(600, 200)  # High-pitched beep for start
                    speech_segments.append(processed_chunk)
                    silent_frames = 0  # Reset silence counter
                else:
                    # Silence detected
                    if self.is_recording:
                        silent_frames += 1
                        if silent_frames >= silence_threshold_frames:
                            # Stop recording if silence exceeds the threshold
                            print("Stopping recording due to silence.")
                            self.is_recording = False
                            AudioPlayback.play_beep(300, 200)  # Low-pitched beep for stop (async)

                            # Save the recorded speech segment
                            if speech_segments:
                                combined_audio = np.concatenate(speech_segments)
                                self.save_recording(combined_audio, output_file, format=format)
                                speech_segments = []  # Reset for the next segment
                        else:
                            # Add silence to the current recording
                            speech_segments.append(processed_chunk)

        except KeyboardInterrupt:
            print("\nContinuous listening interrupted by user.")

        # Save any remaining speech segments
        if speech_segments:
            combined_audio = np.concatenate(speech_segments)
            self.save_recording(combined_audio, output_file, format=format)

    def stop(self):
        """Stop both streaming and recording."""
        self.stop_streaming()

        # Stop the recording process.
        self.is_recording = False
        print("Recording stopped.")

__init__(sampling_rate=16000, chunk_duration=0.1, audio_format='f32le', channels=1, aggressiveness=1, enable_beep=True, enable_noise_canceling=False, low_pass_cutoff=7500.0, stationary_noise_reduction=False, prop_decrease=1.0, n_std_thresh_stationary=1.5, n_jobs=1, use_torch=False, device='cuda', calibration_duration=2.0, use_adaptive_threshold=True)

Initialize the LiveAudioCapture instance.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| sampling_rate | int | Sample rate in Hz (e.g., 16000). | 16000 |
| chunk_duration | float | Duration of each audio chunk in seconds (e.g., 0.1). | 0.1 |
| audio_format | str | Audio format for FFmpeg output (e.g., "f32le"). | 'f32le' |
| channels | int | Number of audio channels (1 for mono, 2 for stereo). | 1 |
| aggressiveness | int | Aggressiveness level for VAD (0 = least aggressive, 3 = most aggressive). | 1 |
| enable_beep | bool | Whether to play beep sounds when recording starts/stops. | True |
| enable_noise_canceling | bool | Whether to apply noise cancellation. | False |
| low_pass_cutoff | float | Cutoff frequency for the low-pass filter. | 7500.0 |
| stationary_noise_reduction | bool | Whether to use stationary noise reduction. | False |
| prop_decrease | float | Proportion to reduce noise by (1.0 = 100%). | 1.0 |
| n_std_thresh_stationary | float | Threshold for stationary noise reduction. | 1.5 |
| n_jobs | int | Number of parallel jobs to run. Set to -1 to use all CPU cores. | 1 |
| use_torch | bool | Whether to use the PyTorch version of spectral gating. | False |
| device | str | Device to run the PyTorch spectral gating on (e.g., "cuda" or "cpu"). | 'cuda' |
| calibration_duration | float | Duration of the calibration phase in seconds. | 2.0 |
| use_adaptive_threshold | bool | Whether to use adaptive thresholding for VAD. | True |

Source code in live_audio_capture\audio_capture.py
def __init__(
    self,
    sampling_rate: int = 16000,
    chunk_duration: float = 0.1,
    audio_format: str = "f32le",
    channels: int = 1,
    aggressiveness: int = 1,  # Aggressiveness level for VAD
    enable_beep: bool = True,
    enable_noise_canceling: bool = False,
    low_pass_cutoff: float = 7500.0,
    stationary_noise_reduction: bool = False,
    prop_decrease: float = 1.0,
    n_std_thresh_stationary: float = 1.5,
    n_jobs: int = 1,
    use_torch: bool = False,
    device: str = "cuda",
    calibration_duration: float = 2.0,  # Duration of calibration in seconds
    use_adaptive_threshold: bool = True,  # Enable adaptive thresholding
):
    """
    Initialize the LiveAudioCapture instance.

    Args:
        sampling_rate (int): Sample rate in Hz (e.g., 16000).
        chunk_duration (float): Duration of each audio chunk in seconds (e.g., 0.1).
        audio_format (str): Audio format for FFmpeg output (e.g., "f32le").
        channels (int): Number of audio channels (1 for mono, 2 for stereo).
        aggressiveness (int): Aggressiveness level for VAD (0 = least aggressive, 3 = most aggressive).
        enable_beep (bool): Whether to play beep sounds when recording starts/stops.
        enable_noise_canceling (bool): Whether to apply noise cancellation.
        low_pass_cutoff (float): Cutoff frequency for the low-pass filter.
        stationary_noise_reduction (bool): Whether to use stationary noise reduction.
        prop_decrease (float): Proportion to reduce noise by (1.0 = 100%).
        n_std_thresh_stationary (float): Threshold for stationary noise reduction.
        n_jobs (int): Number of parallel jobs to run. Set to -1 to use all CPU cores.
        use_torch (bool): Whether to use the PyTorch version of spectral gating.
        device (str): Device to run the PyTorch spectral gating on (e.g., "cuda" or "cpu").
        calibration_duration (float): Duration of the calibration phase in seconds.
        use_adaptive_threshold (bool): Whether to use adaptive thresholding for VAD.
    """
    self.sampling_rate = sampling_rate
    self.chunk_duration = chunk_duration
    self.audio_format = audio_format
    self.channels = channels
    self.enable_beep = enable_beep
    self.enable_noise_canceling = enable_noise_canceling
    self.low_pass_cutoff = low_pass_cutoff
    self.stationary_noise_reduction = stationary_noise_reduction
    self.prop_decrease = prop_decrease
    self.n_std_thresh_stationary = n_std_thresh_stationary
    self.n_jobs = n_jobs
    self.use_torch = use_torch
    self.device = device
    self.calibration_duration = calibration_duration
    self.use_adaptive_threshold = use_adaptive_threshold
    self.process: Optional[subprocess.Popen] = None
    self.is_streaming = False
    self.is_recording = False

    # Validate the cutoff frequency
    nyquist = 0.5 * self.sampling_rate
    if self.low_pass_cutoff >= nyquist:
        raise ValueError(
            f"Cutoff frequency must be less than the Nyquist frequency ({nyquist} Hz). "
            f"Provided cutoff frequency: {self.low_pass_cutoff} Hz."
        )

    # Initialize VAD
    self.vad = VoiceActivityDetector(
        sample_rate=sampling_rate,
        frame_duration=chunk_duration,
        aggressiveness=aggressiveness,
        hysteresis_high=1.5,
        hysteresis_low=0.5,
        enable_noise_canceling=self.enable_noise_canceling,
        calibration_duration=self.calibration_duration,
        use_adaptive_threshold=self.use_adaptive_threshold,
        audio_format=self.audio_format,
        channels=self.channels
    )

    # Determine the input device based on the platform
    if sys.platform == "linux":
        self.input_format = "alsa"
    elif sys.platform == "darwin":  # macOS
        self.input_format = "avfoundation"
    elif sys.platform == "win32":
        self.input_format = "dshow"
    else:
        raise RuntimeError(f"Unsupported platform: {sys.platform}")

    # Get the default microphone
    self.input_device = MicUtils.get_default_mic()
    print(f"Using input device: {self.input_device}")

apply_noise_reduction_to_file(input_file, output_file, stationary=False, prop_decrease=1.0, n_std_thresh_stationary=1.5, n_jobs=1, use_torch=False, device='cuda')

Apply noise reduction to an audio file and save the result.

Parameters:

    input_file (str): Path to the input audio file. Required.
    output_file (str): Path to save the processed audio file. Required.
    stationary (bool): Whether to perform stationary noise reduction. Default: False
    prop_decrease (float): Proportion to reduce noise by (1.0 = 100%). Default: 1.0
    n_std_thresh_stationary (float): Threshold for stationary noise reduction. Default: 1.5
    n_jobs (int): Number of parallel jobs to run. Set to -1 to use all CPU cores. Default: 1
    use_torch (bool): Whether to use the PyTorch version of spectral gating. Default: False
    device (str): Device to run the PyTorch spectral gating on (e.g., "cuda" or "cpu"). Default: 'cuda'
Source code in live_audio_capture\audio_capture.py
def apply_noise_reduction_to_file(
    self,
    input_file: str,
    output_file: str,
    stationary: bool = False,
    prop_decrease: float = 1.0,
    n_std_thresh_stationary: float = 1.5,
    n_jobs: int = 1,
    use_torch: bool = False,
    device: str = "cuda",
) -> None:
    """
    Apply noise reduction to an audio file and save the result.

    Args:
        input_file (str): Path to the input audio file.
        output_file (str): Path to save the processed audio file.
        stationary (bool): Whether to perform stationary noise reduction.
        prop_decrease (float): Proportion to reduce noise by (1.0 = 100%).
        n_std_thresh_stationary (float): Threshold for stationary noise reduction.
        n_jobs (int): Number of parallel jobs to run. Set to -1 to use all CPU cores.
        use_torch (bool): Whether to use the PyTorch version of spectral gating.
        device (str): Device to run the PyTorch spectral gating on (e.g., "cuda" or "cpu").
    """
    AudioProcessing.apply_noise_reduction_to_file(
        input_file,
        output_file,
        stationary=stationary,
        prop_decrease=prop_decrease,
        n_std_thresh_stationary=n_std_thresh_stationary,
        n_jobs=n_jobs,
        use_torch=use_torch,
        device=device,
    )
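
Given a LiveAudioCapture instance named capture, a CPU-only denoising pass over a file might look like this (paths and values are illustrative):

capture.apply_noise_reduction_to_file(
    input_file="noisy.wav",
    output_file="clean.wav",
    stationary=True,
    prop_decrease=0.9,  # remove 90% of the estimated noise
    n_jobs=-1,          # use all CPU cores
)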

change_input_device(mic_name)

Change the input device to the specified microphone by name.

Parameters:

    mic_name (str): The name of the microphone to use. Required.
Source code in live_audio_capture\audio_capture.py
def change_input_device(self, mic_name: str) -> None:
    """
    Change the input device to the specified microphone by name.

    Args:
        mic_name (str): The name of the microphone to use.
    """
    mics = self.list_available_mics()
    if mic_name not in mics:
        raise ValueError(f"Microphone '{mic_name}' not found. Available microphones: {list(mics.keys())}")
    self.input_device = mics[mic_name]
    print(f"Changed input device to: {self.input_device}")

list_available_mics()

List all available microphones on the system.

Returns:

    Dict[str, str]: A dictionary mapping microphone names to their device IDs.

Source code in live_audio_capture\audio_capture.py
def list_available_mics(self) -> Dict[str, str]:
    """
    List all available microphones on the system.

    Returns:
        Dict[str, str]: A dictionary mapping microphone names to their device IDs.
    """
    return MicUtils.list_mics()

listen_and_record_with_vad(output_file='output.wav', silence_duration=2.0, format='wav')

Continuously listen to the microphone and record speech segments.

Parameters:

    output_file (str): Path to save the recorded audio file. Default: 'output.wav'
    silence_duration (float): Duration of silence (in seconds) to stop recording. Default: 2.0
    format (str): Output format (e.g., "wav", "mp3", "ogg"). Default: 'wav'
Source code in live_audio_capture\audio_capture.py
def listen_and_record_with_vad(
    self,
    output_file: str = "output.wav",
    silence_duration: float = 2.0,
    format: str = "wav"
) -> None:
    """
    Continuously listen to the microphone and record speech segments.

    Args:
        output_file (str): Path to save the recorded audio file.
        silence_duration (float): Duration of silence (in seconds) to stop recording.
        format (str): Output format (e.g., "wav", "mp3", "ogg").
    """
    speech_segments: List[np.ndarray] = []
    self.is_recording = False
    silent_frames = 0
    silence_threshold_frames = int(silence_duration / self.chunk_duration)

    try:
        for audio_chunk in self.stream_audio():
            # Process the audio chunk with optional noise cancellation
            processed_chunk = self.process_audio_chunk(audio_chunk, self.enable_noise_canceling)

            # Process the audio chunk with VAD
            is_speech = self.vad.process_audio(processed_chunk)

            if is_speech:
                # Speech detected
                if not self.is_recording:
                    print("\nStarting recording...")
                    self.is_recording = True
                    if self.enable_beep:
                        AudioPlayback.play_beep(600, 200)  # High-pitched beep for start
                speech_segments.append(processed_chunk)
                silent_frames = 0  # Reset silence counter
            else:
                # Silence detected
                if self.is_recording:
                    silent_frames += 1
                    if silent_frames >= silence_threshold_frames:
                        # Stop recording if silence exceeds the threshold
                        print("Stopping recording due to silence.")
                        self.is_recording = False
                        if self.enable_beep:
                            AudioPlayback.play_beep(300, 200)  # Low-pitched beep for stop (async)

                        # Save the recorded speech segment
                        if speech_segments:
                            combined_audio = np.concatenate(speech_segments)
                            self.save_recording(combined_audio, output_file, format=format)
                            speech_segments = []  # Reset for the next segment
                    else:
                        # Add silence to the current recording
                        speech_segments.append(processed_chunk)

    except KeyboardInterrupt:
        print("\nContinuous listening interrupted by user.")

    # Save any remaining speech segments
    if speech_segments:
        combined_audio = np.concatenate(speech_segments)
        self.save_recording(combined_audio, output_file, format=format)
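
A variant call that shortens the silence window and exports OGG (any format accepted by the pydub/FFmpeg export backend should work):

capture.listen_and_record_with_vad(
    output_file="segment.ogg",
    silence_duration=1.0,  # end a segment after 1 s of silence
    format="ogg",
)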

play_audio_file(file_path)

Play an audio file using the simpleaudio library.

Parameters:

    file_path (str): Path to the audio file to play. Required.
Source code in live_audio_capture\audio_capture.py
def play_audio_file(self, file_path: str) -> None:
    """
    Play an audio file using the simpleaudio library.

    Args:
        file_path (str): Path to the audio file to play.
    """
    AudioPlayback.play_audio_file(file_path)

process_audio_chunk(audio_chunk, enable_noise_canceling=True)

Process an audio chunk with optional noise cancellation.

Parameters:

    audio_chunk (np.ndarray): The audio chunk to process. Required.
    enable_noise_canceling (bool): Whether to apply noise cancellation. Default: True

Returns:

    np.ndarray: The processed audio chunk.

Source code in live_audio_capture\audio_capture.py
def process_audio_chunk(self, audio_chunk: np.ndarray, enable_noise_canceling: bool = True) -> np.ndarray:
    """
    Process an audio chunk with optional noise cancellation.

    Args:
        audio_chunk (np.ndarray): The audio chunk to process.
        enable_noise_canceling (bool): Whether to apply noise cancellation.

    Returns:
        np.ndarray: The processed audio chunk.
    """
    if enable_noise_canceling:
        # Apply noise reduction using noisereduce
        audio_chunk = AudioNoiseReduction.apply_noise_reduction(
            audio_chunk,
            self.sampling_rate,
            stationary=self.stationary_noise_reduction,
            prop_decrease=self.prop_decrease,
            n_std_thresh_stationary=self.n_std_thresh_stationary,
            n_jobs=self.n_jobs,  # Pass the number of parallel jobs
            use_torch=self.use_torch,  # Enable/disable PyTorch
            device=self.device,  # Specify the device for PyTorch
        )
        # Apply low-pass filter
        audio_chunk = AudioNoiseReduction.apply_low_pass_filter(audio_chunk, self.sampling_rate, self.low_pass_cutoff)
    return audio_chunk
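
A self-contained sketch of processing one synthetic chunk through an existing capture instance (the tone and noise levels are arbitrary test values):

import numpy as np

# 0.1 s at 16 kHz: a 440 Hz tone plus low-level white noise.
t = np.linspace(0, 0.1, 1600, endpoint=False)
chunk = (0.5 * np.sin(2 * np.pi * 440 * t) + 0.05 * np.random.randn(1600)).astype(np.float32)

denoised = capture.process_audio_chunk(chunk, enable_noise_canceling=True)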

save_recording(audio_data, output_file, format='wav')

Save the recorded audio to a file in the specified format.

Parameters:

    audio_data (np.ndarray): The recorded audio data. Required.
    output_file (str): Path to save the recorded audio file. Required.
    format (str): Output format (e.g., "wav", "mp3", "ogg"). Default: 'wav'
Source code in live_audio_capture\audio_capture.py
def save_recording(self, audio_data: np.ndarray, output_file: str, format: str = "wav") -> None:
    """
    Save the recorded audio to a file in the specified format.

    Args:
        audio_data (np.ndarray): The recorded audio data.
        output_file (str): Path to save the recorded audio file.
        format (str): Output format (e.g., "wav", "mp3", "ogg").
    """
    # Scale the audio data to the appropriate range
    if self.audio_format == "f32le":
        # Clip floating-point samples to the range [-1, 1]
        audio_data = np.clip(audio_data, -1.0, 1.0)
        # Convert to 16-bit integer format for saving
        audio_data = (audio_data * 32767).astype(np.int16)
    elif self.audio_format == "s16le":
        # Data is already in 16-bit integer format
        audio_data = audio_data.astype(np.int16)
    else:
        raise RuntimeError(f"Unsupported audio format: {self.audio_format}")

    # Convert the NumPy array to a PyDub AudioSegment
    audio_segment = AudioSegment(
        audio_data.tobytes(),
        frame_rate=self.sampling_rate,
        sample_width=2,  # 16-bit audio (2 bytes per sample)
        channels=self.channels,
    )

    # Normalize the volume to prevent clipping
    audio_segment = audio_segment.normalize()

    # Save the audio in the specified format
    audio_segment.export(output_file, format=format)
    print(f"Recording saved to {output_file} in {format.upper()} format.")

stop()

Stop both streaming and recording.

Source code in live_audio_capture\audio_capture.py
def stop(self):
    """Stop both streaming and recording."""
    self.stop_streaming()

    # Stop the recording process.
    self.is_recording = False
    print("Recording stopped.")

stop_streaming()

Stop the audio stream and terminate the FFmpeg process.

Source code in live_audio_capture\audio_capture.py
def stop_streaming(self) -> None:
    """Stop the audio stream and terminate the FFmpeg process."""
    self.is_streaming = False
    if self.process:
        try:
            self.process.terminate()
            self.process.wait(timeout=5)  # Wait for the process to terminate
        except subprocess.TimeoutExpired:
            self.process.kill()  # Force kill if it doesn't terminate
        self.process = None
    print("Streaming stopped.")

stream_audio()

Stream live audio from the microphone.

Source code in live_audio_capture\audio_capture.py
def stream_audio(self) -> Generator[np.ndarray, None, None]:
    """Stream live audio from the microphone."""
    self._start_ffmpeg_process()
    self.is_streaming = True

    try:
        while self.is_streaming:
            # Read raw audio data from FFmpeg stdout
            raw_data = self.process.stdout.read(self.chunk_size)
            if not raw_data:
                # Print FFmpeg errors if no data is received
                stderr = self.process.stderr.read().decode()
                if stderr:
                    print("FFmpeg errors:", stderr)
                break

            # Convert raw data to NumPy array
            if self.audio_format == "f32le":
                audio_chunk = np.frombuffer(raw_data, dtype=np.float32)
            elif self.audio_format == "s16le":
                audio_chunk = np.frombuffer(raw_data, dtype=np.int16) / 32768.0  # Normalize to [-1, 1]
            else:
                raise RuntimeError(f"Unsupported audio format: {self.audio_format}")

            # Yield the audio chunk for processing
            yield audio_chunk

    except KeyboardInterrupt:
        print("\nStreaming interrupted by user.")
    finally:
        self.stop_streaming()
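
A sketch of consuming the generator directly (the chunk count is arbitrary; stop_streaming is called explicitly so FFmpeg terminates promptly rather than waiting for the generator to be garbage-collected):

import numpy as np

for i, chunk in enumerate(capture.stream_audio()):
    rms = float(np.sqrt(np.mean(chunk ** 2)))
    print(f"chunk {i}: {chunk.size} samples, RMS {rms:.4f}")
    if i >= 9:  # stop after 10 chunks (~1 s at the default settings)
        capture.stop_streaming()
        break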

MicUtils

Utilities for managing and interacting with microphones.

Source code in live_audio_capture\audio_utils\mic_utils.py
class MicUtils:
    """
    Utilities for managing and interacting with microphones.
    """

    @staticmethod
    def list_mics() -> Dict[str, str]:
        """
        List all available microphones on the system.

        Returns:
            Dict[str, str]: A dictionary mapping microphone names to their OS-specific device IDs.
        """
        if sys.platform == "linux":
            return MicUtils._list_mics_linux()
        elif sys.platform == "darwin":  # macOS
            return MicUtils._list_mics_mac()
        elif sys.platform == "win32":
            return MicUtils._list_mics_windows()
        else:
            raise RuntimeError(f"Unsupported platform: {sys.platform}")

    @staticmethod
    def _list_mics_linux() -> Dict[str, str]:
        """List microphones on Linux using ALSA."""
        try:
            result = subprocess.run(["arecord", "-l"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, encoding="utf-8", errors="replace")
            if result.returncode != 0:
                raise RuntimeError("Failed to list audio devices using arecord.")

            mics = {}
            lines = result.stdout.splitlines()
            for line in lines:
                if "card" in line and "device" in line:
                    match = re.search(r"card (\d+):.*device (\d+):", line)
                    if match:
                        card, device = match.groups()
                        mic_name = f"Card {card}, Device {device}"
                        mics[mic_name] = f"hw:{card},{device}"
            return mics
        except Exception as e:
            print(f"Error listing microphones: {e}")
            return {}

    @staticmethod
    def _list_mics_mac() -> Dict[str, str]:
        """List microphones on macOS using AVFoundation."""
        try:
            result = subprocess.run(
                ["ffmpeg", "-f", "avfoundation", "-list_devices", "true", "-i", ""],
                stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, encoding="utf-8", errors="replace"
            )
            if result.returncode != 0:
                raise RuntimeError("Failed to list audio devices using ffmpeg.")

            mics = {}
            lines = result.stderr.splitlines()
            for line in lines:
                if "AVFoundation audio devices" in line:
                    continue
                if "[AVFoundation input device" in line and ("Microphone" in line or "Built-in" in line):
                    match = re.search(r"\[(\d+)\]", line)
                    if match:
                        device_id = match.group(1)
                        mic_name = line.split("]")[1].strip()
                        mics[mic_name] = f":{device_id}"
            return mics
        except Exception as e:
            print(f"Error listing microphones: {e}")
            return {}

    @staticmethod
    def _list_mics_windows() -> Dict[str, str]:
        """List microphones on Windows using DirectShow."""
        try:
            result = subprocess.run(
                ["ffmpeg", "-f", "dshow", "-list_devices", "true", "-i", "dummy"],
                stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, encoding="utf-8", errors="replace"
            )
            if result.returncode != 0:
                raise RuntimeError("Failed to list audio devices using ffmpeg.")

            mics = {}
            lines = result.stderr.splitlines()
            for line in lines:
                if "DirectShow audio devices" in line:
                    continue
                if "microphone" in line.lower() or "Microphone Array" in line:
                    match = re.search(r'"(.*)"', line)
                    if match:
                        mic_name = match.group(1)
                        mics[mic_name] = f"audio={mic_name}"
            return mics
        except Exception as e:
            print(f"Error listing microphones: {e}")
            return {}

    @staticmethod
    def get_default_mic() -> str:
        """Get the default microphone device based on the platform."""
        if sys.platform == "linux":
            return MicUtils._get_default_mic_linux()
        elif sys.platform == "darwin":  # macOS
            return MicUtils._get_default_mic_mac()
        elif sys.platform == "win32":
            return MicUtils._get_default_mic_windows()
        else:
            raise RuntimeError(f"Unsupported platform: {sys.platform}")

    @staticmethod
    def _get_default_mic_linux() -> str:
        """Get the default microphone device on Linux using ALSA."""
        try:
            result = subprocess.run(["arecord", "-l"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, encoding="utf-8", errors="replace")
            if result.returncode != 0:
                raise RuntimeError("Failed to list audio devices using arecord.")

            lines = result.stdout.splitlines()
            for line in lines:
                if "card" in line and "device" in line:
                    match = re.search(r"card (\d+):.*device (\d+):", line)
                    if match:
                        card, device = match.groups()
                        return f"hw:{card},{device}"
        except Exception as e:
            print(f"Error detecting default microphone: {e}")
        return "default"  # Fallback to default

    @staticmethod
    def _get_default_mic_mac() -> str:
        """Get the default microphone device on macOS using AVFoundation."""
        try:
            result = subprocess.run(
                ["ffmpeg", "-f", "avfoundation", "-list_devices", "true", "-i", ""],
                stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, encoding="utf-8", errors="replace"
            )
            if result.returncode != 0:
                raise RuntimeError("Failed to list audio devices using ffmpeg.")

            lines = result.stderr.splitlines()
            for line in lines:
                if "AVFoundation audio devices" in line:
                    continue
                if "[AVFoundation input device" in line and "Microphone" in line:
                    match = re.search(r"\[(\d+)\]", line)
                    if match:
                        return f":{match.group(1)}"  # Format for macOS
        except Exception as e:
            print(f"Error detecting default microphone: {e}")
        return ":0"  # Fallback to default

    @staticmethod
    def _get_default_mic_windows() -> str:
        """Get the default microphone device on Windows using DirectShow."""
        try:
            result = subprocess.run(
                ["ffmpeg", "-f", "dshow", "-list_devices", "true", "-i", "dummy"],
                stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, encoding="utf-8", errors="replace"
            )
            if result.returncode != 0:
                raise RuntimeError("Failed to list audio devices using ffmpeg.")

            lines = result.stderr.splitlines()
            for line in lines:
                if "DirectShow audio devices" in line:
                    continue
                if "microphone" in line or "Microphone" in line or "Microphone Array" in line:
                    match = re.search(r'"(.*)"', line)
                    if match:
                        return f"audio={match.group(1)}"  # Format for Windows
        except Exception as e:
            print(f"Error detecting default microphone: {e}")
        return "audio=Microphone"  # Fallback to default

get_default_mic() staticmethod

Get the default microphone device based on the platform.

Source code in live_audio_capture\audio_utils\mic_utils.py
@staticmethod
def get_default_mic() -> str:
    """Get the default microphone device based on the platform."""
    if sys.platform == "linux":
        return MicUtils._get_default_mic_linux()
    elif sys.platform == "darwin":  # macOS
        return MicUtils._get_default_mic_mac()
    elif sys.platform == "win32":
        return MicUtils._get_default_mic_windows()
    else:
        raise RuntimeError(f"Unsupported platform: {sys.platform}")

list_mics() staticmethod

List all available microphones on the system.

Returns:

    Dict[str, str]: A dictionary mapping microphone names to their OS-specific device IDs.

Source code in live_audio_capture\audio_utils\mic_utils.py
@staticmethod
def list_mics() -> Dict[str, str]:
    """
    List all available microphones on the system.

    Returns:
        Dict[str, str]: A dictionary mapping microphone names to their OS-specific device IDs.
    """
    if sys.platform == "linux":
        return MicUtils._list_mics_linux()
    elif sys.platform == "darwin":  # macOS
        return MicUtils._list_mics_mac()
    elif sys.platform == "win32":
        return MicUtils._list_mics_windows()
    else:
        raise RuntimeError(f"Unsupported platform: {sys.platform}")

VoiceActivityDetector

A simplified voice activity detector (VAD) similar to WebRTC VAD. Features:

- Energy-based speech detection.
- Aggressiveness level to control detection strictness.
- Hysteresis for stable speech detection.

Source code in live_audio_capture\vad.py
class VoiceActivityDetector:
    """
    A simplified voice activity detector (VAD) similar to WebRTC VAD.
    Features:
    - Energy-based speech detection.
    - Aggressiveness level to control detection strictness.
    - Hysteresis for stable speech detection.
    """

    def __init__(
        self,
        sample_rate: int = 16000,
        frame_duration: float = 0.03,
        aggressiveness: int = 1,  # Aggressiveness level (0, 1, 2, or 3)
        hysteresis_high: float = 1.5,
        hysteresis_low: float = 0.5,
        enable_noise_canceling: bool = False,
        calibration_duration: float = 2.0,  # Duration of calibration in seconds
        use_adaptive_threshold: bool = True,  # Enable adaptive thresholding
        audio_format: str = "f32le",  # Audio format for calibration
        channels: int = 1
    ):
        """
        Initialize the VoiceActivityDetector.

        Args:
            sample_rate (int): Sample rate of the audio (default: 16000 Hz).
            frame_duration (float): Duration of each frame in seconds (default: 0.03 seconds).
            aggressiveness (int): Aggressiveness level (0 = least aggressive, 3 = most aggressive).
            hysteresis_high (float): Multiplier for the threshold when speech is detected.
            hysteresis_low (float): Multiplier for the threshold when speech is not detected.
            enable_noise_canceling (bool): Whether to apply noise cancellation.
            calibration_duration (float): Duration of the calibration phase in seconds.
            use_adaptive_threshold (bool): Whether to use adaptive thresholding.
            audio_format (str): Audio format for calibration (e.g., "f32le" or "s16le").
            channels (int): Number of audio channels (1 for mono, 2 for stereo).
        """
        self.sample_rate = sample_rate
        self.frame_duration = frame_duration
        self.frame_size = int(sample_rate * frame_duration)
        self.aggressiveness = aggressiveness
        self.hysteresis_high = hysteresis_high
        self.hysteresis_low = hysteresis_low
        self.enable_noise_canceling = enable_noise_canceling
        self.calibration_duration = calibration_duration
        self.use_adaptive_threshold = use_adaptive_threshold
        self.audio_format = audio_format
        self.channels = channels

        # Calibrate the initial threshold
        self.initial_threshold = self._calibrate_threshold() if self.use_adaptive_threshold else self._get_manual_threshold()
        self.current_threshold = self.initial_threshold
        self.speech_active = False

        print(f"Initialized VAD with aggressiveness={aggressiveness}, initial_threshold={self.initial_threshold:.6f}")

    def _calibrate_threshold(self) -> float:
        """
        Calibrate the initial energy threshold based on the background noise level.

        Returns:
            float: Calibrated initial energy threshold.
        """
        print("Calibrating threshold... Please remain silent for a few seconds.")
        audio_chunks = self._capture_calibration_audio()
        background_energy = np.mean([AudioProcessing.calculate_energy(chunk) for chunk in audio_chunks])
        print(f"Calibration complete. Background energy: {background_energy:.6f}")

        # Define multipliers based on aggressiveness
        multipliers = {
            0: 1.5,  # Least aggressive
            1: 2.0,
            2: 2.5,
            3: 3.0,  # Most aggressive
        }
        return background_energy * multipliers.get(self.aggressiveness, 2.0)

    def _capture_calibration_audio(self) -> List[np.ndarray]:
        """
        Capture a short audio sample for calibration.

        Returns:
            List[np.ndarray]: List of audio chunks captured during calibration.
        """
        # Start FFmpeg process for calibration
        command = [
            "ffmpeg",
            "-f", "alsa" if sys.platform == "linux" else "avfoundation" if sys.platform == "darwin" else "dshow",
            "-i", MicUtils.get_default_mic(),
            "-ar", str(self.sample_rate),
            "-ac", str(self.channels),
            "-f", self.audio_format,
            "-"
        ]
        process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        # Capture audio chunks for the calibration duration
        audio_chunks = []
        for _ in range(int(self.calibration_duration / self.frame_duration)):
            bytes_per_sample = 4 if self.audio_format == "f32le" else 2  # f32le: 4 bytes, s16le: 2 bytes
            raw_data = process.stdout.read(self.frame_size * bytes_per_sample)
            if not raw_data:
                break
            audio_chunk = AudioProcessing.process_audio_chunk(raw_data, self.audio_format)
            audio_chunks.append(audio_chunk)

        # Stop the FFmpeg process
        process.terminate()
        process.wait()

        return audio_chunks

    def _get_manual_threshold(self) -> float:
        """
        Get the initial energy threshold based on the aggressiveness level (manual values).

        Returns:
            float: Initial energy threshold.
        """
        # Manual thresholds
        if self.aggressiveness == 0:
            return 0.0005 if not self.enable_noise_canceling else 0.00002  # Least aggressive (lowest threshold)
        elif self.aggressiveness == 1:
            return 0.001 if not self.enable_noise_canceling else 0.00003
        elif self.aggressiveness == 2:
            return 0.002 if not self.enable_noise_canceling else 0.00004
        elif self.aggressiveness == 3:
            return 0.005 if not self.enable_noise_canceling else 0.0001  # Most aggressive (highest threshold)
        else:
            raise ValueError("Aggressiveness must be between 0 and 3.")

    def process_audio(self, audio_chunk: np.ndarray) -> bool:
        """
        Process an audio chunk and determine if speech is detected.

        Args:
            audio_chunk (np.ndarray): Audio chunk to process.

        Returns:
            bool: True if speech is detected, False otherwise.
        """
        # Calculate energy
        energy = AudioProcessing.calculate_energy(audio_chunk)

        # Detect speech based on energy
        if energy > self.current_threshold:
            self.speech_active = True
            print("Speech detected!")
        else:
            self.speech_active = False
            print("No speech detected.")

        # Update threshold using hysteresis
        self._update_threshold()

        # Debugging: Print key values
        print(
            f"Energy: {energy:.6f}, Current Threshold: {self.current_threshold:.6f}, "
            f"Speech Active: {self.speech_active}"
        )

        return self.speech_active

    def _update_threshold(self) -> None:
        """
        Update the energy threshold using hysteresis.
        """
        if self.speech_active:
            # Increase threshold slightly to avoid false positives
            self.current_threshold = self.initial_threshold * self.hysteresis_high
        else:
            # Lower threshold to detect speech more sensitively
            self.current_threshold = self.initial_threshold * self.hysteresis_low
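
A minimal standalone sketch (the import path is assumed from the source listing above). Passing use_adaptive_threshold=False skips the microphone calibration pass in the constructor and falls back to the manual thresholds from _get_manual_threshold:

import numpy as np
from live_audio_capture.vad import VoiceActivityDetector

vad = VoiceActivityDetector(aggressiveness=2, use_adaptive_threshold=False)

loud = (0.5 * np.random.randn(480)).astype(np.float32)    # energy well above the 0.002 manual threshold
quiet = (0.0001 * np.random.randn(480)).astype(np.float32)
print(vad.process_audio(loud))   # expected: True
print(vad.process_audio(quiet))  # expected: False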

__init__(sample_rate=16000, frame_duration=0.03, aggressiveness=1, hysteresis_high=1.5, hysteresis_low=0.5, enable_noise_canceling=False, calibration_duration=2.0, use_adaptive_threshold=True, audio_format='f32le', channels=1)

Initialize the VoiceActivityDetector.

Parameters:

    sample_rate (int): Sample rate of the audio. Default: 16000
    frame_duration (float): Duration of each frame in seconds. Default: 0.03
    aggressiveness (int): Aggressiveness level (0 = least aggressive, 3 = most aggressive). Default: 1
    hysteresis_high (float): Multiplier for the threshold when speech is detected. Default: 1.5
    hysteresis_low (float): Multiplier for the threshold when speech is not detected. Default: 0.5
    enable_noise_canceling (bool): Whether to apply noise cancellation. Default: False
    calibration_duration (float): Duration of the calibration phase in seconds. Default: 2.0
    use_adaptive_threshold (bool): Whether to use adaptive thresholding. Default: True
    audio_format (str): Audio format for calibration (e.g., "f32le" or "s16le"). Default: 'f32le'
    channels (int): Number of audio channels (1 for mono, 2 for stereo). Default: 1
Source code in live_audio_capture\vad.py
def __init__(
    self,
    sample_rate: int = 16000,
    frame_duration: float = 0.03,
    aggressiveness: int = 1,  # Aggressiveness level (0, 1, 2, or 3)
    hysteresis_high: float = 1.5,
    hysteresis_low: float = 0.5,
    enable_noise_canceling: bool = False,
    calibration_duration: float = 2.0,  # Duration of calibration in seconds
    use_adaptive_threshold: bool = True,  # Enable adaptive thresholding
    audio_format: str = "f32le",  # Audio format for calibration
    channels: int = 1
):
    """
    Initialize the VoiceActivityDetector.

    Args:
        sample_rate (int): Sample rate of the audio (default: 16000 Hz).
        frame_duration (float): Duration of each frame in seconds (default: 0.03 seconds).
        aggressiveness (int): Aggressiveness level (0 = least aggressive, 3 = most aggressive).
        hysteresis_high (float): Multiplier for the threshold when speech is detected.
        hysteresis_low (float): Multiplier for the threshold when speech is not detected.
        enable_noise_canceling (bool): Whether to apply noise cancellation.
        calibration_duration (float): Duration of the calibration phase in seconds.
        use_adaptive_threshold (bool): Whether to use adaptive thresholding.
        audio_format (str): Audio format for calibration (e.g., "f32le" or "s16le").
        channels (int): Number of audio channels (1 for mono, 2 for stereo).
    """
    self.sample_rate = sample_rate
    self.frame_duration = frame_duration
    self.frame_size = int(sample_rate * frame_duration)
    self.aggressiveness = aggressiveness
    self.hysteresis_high = hysteresis_high
    self.hysteresis_low = hysteresis_low
    self.enable_noise_canceling = enable_noise_canceling
    self.calibration_duration = calibration_duration
    self.use_adaptive_threshold = use_adaptive_threshold
    self.audio_format = audio_format
    self.channels = channels

    # Calibrate the initial threshold
    self.initial_threshold = self._calibrate_threshold() if self.use_adaptive_threshold else self._get_manual_threshold()
    self.current_threshold = self.initial_threshold
    self.speech_active = False

    print(f"Initialized VAD with aggressiveness={aggressiveness}, initial_threshold={self.initial_threshold:.6f}")

process_audio(audio_chunk)

Process an audio chunk and determine if speech is detected.

Parameters:

    audio_chunk (np.ndarray): Audio chunk to process. Required.

Returns:

    bool: True if speech is detected, False otherwise.
Source code in live_audio_capture\vad.py
def process_audio(self, audio_chunk: np.ndarray) -> bool:
    """
    Process an audio chunk and determine if speech is detected.

    Args:
        audio_chunk (np.ndarray): Audio chunk to process.

    Returns:
        bool: True if speech is detected, False otherwise.
    """
    # Calculate energy
    energy = AudioProcessing.calculate_energy(audio_chunk)

    # Detect speech based on energy
    if energy > self.current_threshold:
        self.speech_active = True
        print("Speech detected!")
    else:
        self.speech_active = False
        print("No speech detected.")

    # Update threshold using hysteresis
    self._update_threshold()

    # Debugging: Print key values
    print(
        f"Energy: {energy:.6f}, Current Threshold: {self.current_threshold:.6f}, "
        f"Speech Active: {self.speech_active}"
    )

    return self.speech_active
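
As a worked example of the hysteresis update: with initial_threshold = 0.001, hysteresis_high = 1.5, and hysteresis_low = 0.5, the threshold applied to the next chunk is 0.001 * 1.5 = 0.0015 after a speech frame (reducing false positives) and 0.001 * 0.5 = 0.0005 after a silent frame (making the detector more sensitive to the next utterance).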