How do I only decode audio but mux video from an rtsp source? #862

marawan31 · 2020-05-29T22:32:11Z

marawan31
May 29, 2020

Overview

So basically I want to decode audio then resample/encode it back to be muxed, while also muxing video. I am trying to split a video into multiple mpegts chunks.
The problem I'm facing is that when I resample and re-encode the audio, the result is choppy and sounds wrong.
If I only mux the audio instead of resampling it, everything is perfect.

Technically I save a chunk at every keyframe, but a chunk could span any number of frames; as I understand it, it just needs to start on a keyframe. The example stream below happens to have exactly one keyframe every 2 seconds, so I save 2-second chunks.

This code can be run directly after setting the output_path variable at the top to a valid path.
This code only creates 5 chunks for demonstration purposes:


import io
import attr
import av


# Directory that receives the generated segment files; save_chunk() appends
# "/mpgts_<index>.ts" to it.
output_path = "/some/path" # without the ending '/'
# Sample rate handed to both the AAC output stream and the audio resampler.
audio_sample_rate = 44100
# RTSP source opened by main().
rtsp_url = "rtsp://wowzaec2demo.streamlock.net/vod/mp4:BigBuckBunny_115k.mov"

@attr.s
class Buffer:
    """State of one output segment that is currently being written.

    Instances are built positionally by create_output() as
    ``Buffer(chunk, container, vstream, astream)``, so the field order below
    is part of the constructor signature.
    """

    # In-memory byte sink the output container writes into.
    chunk = attr.ib(type=io.BytesIO)
    # Output container opened on top of ``chunk``.
    container = attr.ib()
    # Video stream added to ``container``.
    vstream = attr.ib()
    # Audio stream added to ``container``; None when no audio stream was added.
    astream = attr.ib(default=None)

def save_chunk(segment_data, segment_index):
    """Write one finished segment to ``{output_path}/mpgts_{segment_index}.ts``.

    Args:
        segment_data: In-memory buffer (anything exposing ``getvalue()``)
            holding the complete segment bytes.
        segment_index: Number used in the output file name.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # The context manager closes the file even when write() raises, so a
    # failed write no longer leaks the file handle.
    with open(f"{output_path}/mpgts_{segment_index}.ts", 'wb') as f:
        f.write(segment_data.getvalue())

def create_output(video_stream, audio_steam):
    """Open a new in-memory MPEG-TS output and return it wrapped in a Buffer.

    Args:
        video_stream: Input video stream used as the template for the output
            video stream.
        audio_steam: Input audio stream, or None. It is only checked for
            presence; when present an AAC stream at ``audio_sample_rate`` is
            added to the output.

    Returns:
        Buffer holding the byte sink, the output container, the video stream
        and the audio stream (None when no audio stream was requested).
    """
    sink = io.BytesIO()
    muxer = av.open(sink, mode="w", format="mpegts")

    # The audio stream, when wanted, is added before the video stream.
    audio_out = (
        muxer.add_stream(codec_name="aac", rate=audio_sample_rate)
        if audio_steam is not None
        else None
    )
    video_out = muxer.add_stream(template=video_stream)

    return Buffer(sink, muxer, video_out, audio_out)

def _mux_audio_packets(output, packets):
    """Attach encoded audio packets to the segment's audio stream and mux them."""
    for a_packet in packets:
        a_packet.stream = output.astream
        print(f"a_packet: pts={str(a_packet.pts)} time_base={str(a_packet.time_base)}, dur: {str(a_packet.duration)}, stream tb: {str(output.astream.time_base)}")
        output.container.mux(a_packet)


def _finalize_segment(output, segment_index):
    """Flush the audio encoder, close the segment container and save its bytes."""
    try:
        if output.astream is not None:
            # encode(None) drains frames still buffered inside the encoder;
            # without it the tail of each segment's audio is dropped.
            _mux_audio_packets(output, output.astream.encode(None))
        output.container.close()
        save_chunk(output.chunk, segment_index)
    finally:
        output.chunk.close()


def main():
    """Split the RTSP source into MPEG-TS segments, one per video keyframe.

    Video packets are remuxed unchanged (timestamps rebased to the first video
    packet); audio packets are decoded, resampled to ``audio_sample_rate`` and
    re-encoded into the segment's audio stream. At most 5 segments are
    written; a partial last segment is also saved if the stream ends earlier.
    Returns early without output when the source lacks a video or audio stream.
    """
    options = {"rtsp_flags": "prefer_tcp"}
    container = av.open(rtsp_url, options=options)
    try:
        try:
            video_stream = container.streams.video[0]
            audio_stream = container.streams.audio[0]
        except (KeyError, IndexError):
            return

        segment_count = 0
        first_audio_packet = True
        first_audio_pts = 0
        first_video_packet = True
        first_video_pts = 0
        first_video_dts = 0
        output = None
        resampler = av.AudioResampler(audio_stream.format, audio_stream.layout, audio_sample_rate)

        # Iterate ONE demux generator. Calling next() on a fresh generator
        # every iteration never yields None, so end-of-stream was not handled.
        for packet in container.demux(video_stream, audio_stream):
            # Packets without a dts (e.g. flush packets) cannot be rebased.
            if packet.dts is None:
                continue

            if packet.stream.type == 'video':
                if first_video_packet:
                    first_video_pts = packet.pts
                    first_video_dts = packet.dts
                    first_video_packet = False
                if packet.is_keyframe:
                    # A keyframe ends the current segment and starts the next.
                    if output is not None:
                        _finalize_segment(output, segment_count)
                        output = None
                        segment_count += 1
                    if segment_count >= 5:
                        break
                    output = create_output(video_stream, audio_stream)

                packet.pts -= first_video_pts
                packet.dts -= first_video_dts

                if output is not None and output.vstream is not None:
                    print(f"video_packet: {str(packet)}, dur: {str(packet.duration)}")
                    packet.stream = output.vstream
                    output.container.mux(packet)
            else:
                # Audio arriving before the first keyframe has no segment yet.
                if output is None:
                    continue
                if first_audio_packet:
                    first_audio_pts = packet.pts
                    first_audio_packet = False
                packet.pts -= first_audio_pts
                packet.dts -= first_audio_pts
                print(f"audio_packet: pts={str(packet.pts)} time_base={str(packet.time_base)}, dur: {str(packet.duration)}")
                for a_frame in packet.decode():
                    print(f"original a_frame: {str(a_frame)}, samples: {str(a_frame.samples)}, time_base={a_frame.time_base}")
                    resampled = resampler.resample(a_frame)
                    # NOTE(review): depending on the PyAV version, resample()
                    # may return None, a single frame, or a list of frames --
                    # confirm against the installed version.
                    if resampled is None:
                        continue
                    if not isinstance(resampled, (list, tuple)):
                        resampled = [resampled]
                    for sampled_frame in resampled:
                        print(f"resampled sampled_frame: {str(sampled_frame)}, samples: {str(sampled_frame.samples)}, time_base={sampled_frame.time_base}")
                        # NOTE(review): with pts cleared and a new encoder per
                        # segment, audio timestamps presumably restart in each
                        # segment while video timestamps keep growing -- verify
                        # whether this mismatch causes the choppy playback.
                        sampled_frame.pts = None
                        _mux_audio_packets(output, output.astream.encode(sampled_frame))

        # The stream ended before the 5-segment limit: keep what was collected.
        if output is not None:
            _finalize_segment(output, segment_count)
            output = None
    finally:
        # Always release the RTSP connection, including on early return/error.
        container.close()

# Run the capture only when executed as a script, so importing this module
# does not open the RTSP connection as a side effect.
if __name__ == "__main__":
    main()

Expected behavior

The video should play smoothly without hiccups.

Actual behavior

Playback is choppy, including the video part (not just the audio).

Investigation

I've tried all sorts of stuff:

Instead of creating my own resampler, use the encode function of the audio output stream directly, replacing the single demux call (which is given both audio and video in the example) with decode for audio and demux for video: same result
Calculate the audio pts manually by incrementing it by each packet's duration (this is not perfect, as video can get ahead of audio if any frames are dropped): same result
Calculate the audio pts based off of the last video pts: same result
Set all pts/dts sent to encode to None: same result
Set None to pts/dts for resample/encode/mux: same result

Research

I have done the following:

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

How do I only decode audio but mux video from an rtsp source? #862

{{title}}

{{editor}}'s edit

{{editor}}'s edit

Replies: 0 comments

Select a reply

How do I only decode audio but mux video from an rtsp source? #862

marawan31 May 29, 2020

Overview

Expected behavior

Actual behavior

Investigation

Research

Replies: 0 comments

marawan31
May 29, 2020