diff --git a/pydub/silence.py b/pydub/silence.py
index 0ad14999..a3b4ad5d 100644
--- a/pydub/silence.py
+++ b/pydub/silence.py
@@ -2,73 +2,162 @@ Various functions for finding/manipulating silence in AudioSegments
 """
 import itertools
+import numpy as np
 
 from .utils import db_to_float
 
 
-def detect_silence(audio_segment, min_silence_len=1000, silence_thresh=-16, seek_step=1):
+def _convert_to_numpy(audio_segment):
+    """
+    Returns a numpy array view of the raw samples of an AudioSegment,
+    with shape (number of frames, channels).
+
+    Does not allocate any additional memory.
+
+    audio_segment - the segment to convert into a numpy array
+    """
+    dtype = {
+        1: np.int8,
+        2: np.int16,
+        4: np.int32
+    }[audio_segment.sample_width]
+    return np.frombuffer(audio_segment.raw_data, dtype=dtype).reshape(-1, audio_segment.channels)
+
+
+def detect_silence(audio_segment, min_silence_len=1000, silence_thresh=-16, seek_step=1, max_buffer_size_kb=100*1024):
     """
     Returns a list of all silent sections [start, end] in milliseconds of audio_segment.
-    Inverse of detect_nonsilent()
+    Inverse of detect_nonsilent().
 
     audio_segment - the segment to find silence in
     min_silence_len - the minimum length for any silent section
     silence_thresh - the upper bound for how quiet is silent in dBFS
     seek_step - step size for iterating over the segment in ms
+    max_buffer_size_kb - the maximum size of internally allocated buffers in KiB (negative values disable the limit)
     """
-    seg_len = len(audio_segment)
-
+    raw_data = _convert_to_numpy(audio_segment)
+    min_silence_ms = min_silence_len
+    silence_threshold_db = silence_thresh
+    seek_step_ms = seek_step
+
+    seg_len_ms = len(audio_segment)
+    frames_per_ms = audio_segment.frame_rate / 1000
+
+    assert raw_data.shape[0] == audio_segment.frame_count()
+    assert raw_data.shape[1] == audio_segment.channels
+
+    max_frames_in_slice = int(np.ceil(min_silence_ms * frames_per_ms))
+
+    # determine the number of frames in the computation window buffer
+    if max_buffer_size_kb >= 0:
+        bytes_per_frame = 8  # one float64 per frame after averaging over channels
+        frames_per_kb = 1024 // bytes_per_frame
+        buffer_len = max_buffer_size_kb * frames_per_kb
+        # empirical testing shows that we need approximately 4 times as much memory
+        # as buffer_len would suggest, probably because numpy allocates temporary
+        # buffers in the background during computations; we correct for this by
+        # adjusting buffer_len accordingly
+        correction_constant = 4
+        buffer_len //= correction_constant
+        if buffer_len < max_frames_in_slice:
+            min_buffer_size = int(np.ceil(max_frames_in_slice * bytes_per_frame / 1024)) * correction_constant
+            raise ValueError(
+                "max_buffer_size_kb is too small: must be at least {} KiB "
+                "for a min_silence_len of {} ms".format(min_buffer_size, min_silence_ms)
+            )
+    else:
+        buffer_len = len(raw_data)  # no restrictions!
+
     # you can't have a silent portion of a sound that is longer than the sound
-    if seg_len < min_silence_len:
+    if seg_len_ms < min_silence_ms:
         return []
-
+
     # convert silence threshold to a float value (so we can compare it to rms)
-    silence_thresh = db_to_float(silence_thresh) * audio_segment.max_possible_amplitude
-
-    # find silence and add start and end indicies to the to_cut list
-    silence_starts = []
+    # samples are divided by normalization_const below, so the threshold is too
+    normalization_const = float(audio_segment.max_possible_amplitude)
+    silence_thresh = db_to_float(silence_threshold_db) * audio_segment.max_possible_amplitude / normalization_const
 
     # check successive (1 sec by default) chunk of sound for silence
     # try a chunk at every "seek step" (or every chunk for a seek step == 1)
-    last_slice_start = seg_len - min_silence_len
-    slice_starts = range(0, last_slice_start + 1, seek_step)
+    last_slice_start = seg_len_ms - min_silence_ms
+    slice_starts = range(0, last_slice_start + 1, seek_step_ms)
 
     # guarantee last_slice_start is included in the range
     # to make sure the last portion of the audio is searched
-    if last_slice_start % seek_step:
+    if last_slice_start % seek_step_ms:
         slice_starts = itertools.chain(slice_starts, [last_slice_start])
 
-    for i in slice_starts:
-        audio_slice = audio_segment[i:i + min_silence_len]
-        if audio_slice.rms <= silence_thresh:
-            silence_starts.append(i)
-
-    # short circuit when there is no silence
-    if not silence_starts:
-        return []
-
-    # combine the silence we detected into ranges (start ms - end ms)
+    # list of all continuous regions of silence (start ms - end ms)
     silent_ranges = []
-
-    prev_i = silence_starts.pop(0)
-    current_range_start = prev_i
-
-    for silence_start_i in silence_starts:
-        continuous = (silence_start_i == prev_i + seek_step)
-
-        # sometimes two small blips are enough for one particular slice to be
-        # non-silent, despite the silence all running together. Just combine
-        # the two overlapping silent ranges.
-        silence_has_gap = silence_start_i > (prev_i + min_silence_len)
-
-        if not continuous and silence_has_gap:
-            silent_ranges.append([current_range_start,
-                                  prev_i + min_silence_len])
-            current_range_start = silence_start_i
-        prev_i = silence_start_i
-
-    silent_ranges.append([current_range_start,
-                          prev_i + min_silence_len])
+
+    prev_silent_ms = None
+    current_range_start = None
+
+    # load the first window into the buffer
+    # cumsq_per_frame holds the cumulative sums of the per-frame mean of squares
+    # over channels, normalized by max_possible_amplitude (i.e., every value x
+    # satisfies 0 <= x <= 1) - this prevents the cumulative sums of squares from
+    # exceeding representable values
+    cumsq_per_frame = np.concatenate((
+        [0.],
+        np.cumsum(
+            np.mean((raw_data[0:buffer_len].astype(np.float64))**2, axis=-1)
+            / (normalization_const**2)
+        )
+    ))
+    # keep track of the frames currently in the buffer
+    buffer_offset = 0
+    buffer_end = buffer_len
+
+    for slice_start_ms in slice_starts:
+        slice_start = int(slice_start_ms * frames_per_ms)
+        slice_end = min(int((slice_start_ms + min_silence_ms) * frames_per_ms), len(raw_data))
+        # if the frame_rate is not divisible by min_silence_ms, slices may contain
+        # slightly varying numbers of frames, so compute the actual slice length here
+        frames_in_slice = slice_end - slice_start
+
+        if slice_end > buffer_end:  # we ran out of buffer; load the next window
+            cumsq_per_frame = np.concatenate((
+                [0.],
+                np.cumsum(
+                    np.mean(
+                        (raw_data[slice_start:slice_start + buffer_len].astype(np.float64))**2,
+                        axis=-1
+                    ) / (normalization_const**2)
+                )
+            ))
+            buffer_offset = slice_start
+            buffer_end = buffer_offset + buffer_len
+
+        # compute the RMS of the current slice from the cumulative sums of squares:
+        # the sum over frames [start, end) is cumsum[end] - cumsum[start]
+        slice_msq = cumsq_per_frame[slice_end - buffer_offset] - cumsq_per_frame[slice_start - buffer_offset]
+        rms = np.sqrt(slice_msq / frames_in_slice)
+
+        if rms <= silence_thresh:
+            # current slice is silent; merge it with the preceding silent slice
+            # unless there is a non-silent gap between them
+            if current_range_start is None:
+                current_range_start = slice_start_ms
+            else:
+                continuous = (slice_start_ms == prev_silent_ms + seek_step_ms)
+
+                # sometimes two small blips are enough for one particular slice to be
+                # non-silent, despite the silence all running together. Just combine
+                # the two overlapping silent ranges.
+                silence_has_gap = slice_start_ms > (prev_silent_ms + min_silence_ms)
+
+                if not continuous and silence_has_gap:
+                    silent_ranges.append([
+                        current_range_start,
+                        prev_silent_ms + min_silence_ms
+                    ])
+                    current_range_start = slice_start_ms
+
+            prev_silent_ms = slice_start_ms
+
+    if current_range_start is not None:
+        assert prev_silent_ms is not None
+        silent_ranges.append([current_range_start,
+                              prev_silent_ms + min_silence_ms])
 
     return silent_ranges
diff --git a/setup.py b/setup.py
index 7a2fc152..dba36922 100644
--- a/setup.py
+++ b/setup.py
@@ -16,6 +16,9 @@
     keywords='audio sound high-level',
     url='http://pydub.com',
     packages=['pydub'],
+    install_requires=[
+        "numpy >= 1.16, < 2.0",
+    ],
     long_description=__doc__,
     classifiers=[
         'Development Status :: 5 - Production/Stable',
diff --git a/test/test.py b/test/test.py
index 1965761f..2694f0fa 100644
--- a/test/test.py
+++ b/test/test.py
@@ -1153,6 +1153,46 @@ def test_realistic_audio(self):
             self.assertTrue(start > prev_end)
             prev_end = end
 
+    def test_detect_silence_noncontinuous_without_gap(self):
+        raw_data = b"\xFF\x7F\xFF\x7F\x00\x00\x00\x00\xFF\x7F\x00\x00\xFF\x7F\x00\x00\x00\x00\xFF\x7F\xFF\x7F"
+        seg = AudioSegment(raw_data, frame_rate=1000, channels=1, sample_width=2)
+
+        self.assertEqual(seg.frame_count(), 11.0)
+        self.assertEqual(len(seg), 11)
+
+        silent_ranges = detect_silence(seg, min_silence_len=3, silence_thresh=-4, seek_step=1)
+        self.assertEqual(silent_ranges, [[1, 10]])
+
+    def test_detect_silence_noncontinuous_with_gap(self):
+        raw_data = b"\xFF\x7F\xFF\x7F\x00\x00\x00\x00\xFF\x7F\xFF\x7F\xFF\x7F\x00\x00\x00\x00\xFF\x7F\xFF\x7F"
+        seg = AudioSegment(raw_data, frame_rate=1000, channels=1, sample_width=2)
+
+        self.assertEqual(seg.frame_count(), 11.0)
+        self.assertEqual(len(seg), 11)
+
+        silent_ranges = detect_silence(seg, min_silence_len=3, silence_thresh=-4, seek_step=1)
+        self.assertEqual(silent_ranges, [[1, 5], [6, 10]])
+
+    def test_detect_silence_noncontinuous_without_gap_last_slice(self):
+        raw_data = b"\xFF\x7F\xFF\x7F\x00\x00\x00\x00\xFF\x7F\xFF\x7F\x00\x00\x00\x00"
+        seg = AudioSegment(raw_data, frame_rate=1000, channels=1, sample_width=2)
+
+        self.assertEqual(seg.frame_count(), 8.0)
+        self.assertEqual(len(seg), 8)
+
+        silent_ranges = detect_silence(seg, min_silence_len=3, silence_thresh=-4, seek_step=2)
+        self.assertEqual(silent_ranges, [[2, 8]])
+
+    def test_detect_silence_noncontinuous_with_gap_last_slice(self):
+        raw_data = b"\xFF\x7F\xFF\x7F\x00\x00\x00\x00\xFF\x7F\xFF\x7F\xFF\x7F\x00\x00\xFF\x7F\x00\x00"
+        seg = AudioSegment(raw_data, frame_rate=1000, channels=1, sample_width=2)
+
+        self.assertEqual(seg.frame_count(), 10.0)
+        self.assertEqual(len(seg), 10)
+
+        silent_ranges = detect_silence(seg, min_silence_len=3, silence_thresh=-4, seek_step=2)
+        self.assertEqual(silent_ranges, [[2, 5], [7, 10]])
+
 
 class GeneratorTests(unittest.TestCase):
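
For reference, a minimal usage sketch of the patched detect_silence() follows. The input file name, threshold, and buffer cap are illustrative assumptions, not part of this patch:

    from pydub import AudioSegment
    from pydub.silence import detect_silence

    seg = AudioSegment.from_wav("example.wav")  # hypothetical input file

    # Cap the internal float64 buffer at roughly 16 MiB; a negative value
    # would buffer the whole segment in a single window.
    silent_ranges = detect_silence(
        seg,
        min_silence_len=1000,          # minimum silence length, in ms
        silence_thresh=-40,            # silence threshold, in dBFS
        seek_step=10,                  # test a slice every 10 ms
        max_buffer_size_kb=16 * 1024,
    )
    print(silent_ranges)               # e.g. [[1200, 3400], [7100, 8250]]

Because each slice's RMS is read off a running cumulative sum of squares, the amortized cost per slice is roughly constant rather than proportional to min_silence_len, at the price of the float64 window buffer whose size max_buffer_size_kb bounds.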