package gopeg /* #cgo CFLAGS: -I${SRCDIR}/vendor/include #cgo linux,amd64 LDFLAGS: -L${SRCDIR}/vendor/linux_amd64 -lavformat -lavcodec -lavutil -lswresample -ldav1d -lm -lz -lpthread -ldrm #cgo linux,arm64 LDFLAGS: -L${SRCDIR}/vendor/linux_arm64 -lavformat -lavcodec -lavutil -lswresample -ldav1d -lm -lz -lpthread -ldrm #cgo windows,amd64 LDFLAGS: -L${SRCDIR}/vendor/windows_amd64 -lavformat -lavcodec -lavutil -lswresample -ldav1d -lm -lz #cgo darwin,arm64 LDFLAGS: -L${SRCDIR}/vendor/darwin_arm64 -lavformat -lavcodec -lavutil -lswresample -ldav1d -lm -lz #cgo darwin,amd64 LDFLAGS: -L${SRCDIR}/vendor/darwin_amd64 -lavformat -lavcodec -lavutil -lswresample -ldav1d -lm -lz #include #include #include #include #include #include #include #include #include extern int goReadPacket(void *opaque, uint8_t *buf, int buf_size); extern int64_t goSeek(void *opaque, int64_t offset, int whence); static AVIOContext* alloc_avio_context(void *opaque) { const int bufSize = 65536; uint8_t *buf = (uint8_t*)av_malloc(bufSize); if (!buf) return NULL; return avio_alloc_context(buf, bufSize, 0, opaque, goReadPacket, NULL, goSeek); } extern enum AVPixelFormat get_sw_format(AVCodecContext *ctx, const enum AVPixelFormat *pix_fmts); */ import "C" import ( "errors" "fmt" "image" "io" "runtime/cgo" "time" "unsafe" ) // VideoMeta describes the video stream found in the media source. type VideoMeta struct { // Width is the encoded frame width in pixels. Width int // Height is the encoded frame height in pixels. Height int // FPSNum is the frame-rate numerator. FPSNum int // FPSDen is the frame-rate denominator. FPSDen int // CodecName is the FFmpeg codec identifier // (for example: "h264", "hevc", "vp9", "av1"). CodecName string } // AudioMeta describes the audio stream found in the media source. type AudioMeta struct { // SampleRate is the stream sample rate in Hz. SampleRate int // Channels is the number of audio channels. Channels int // CodecName is the FFmpeg codec identifier // (for example: "aac", "opus", "mp3"). CodecName string } // Meta contains container-level media information. type Meta struct { // Duration is the total media duration. Duration time.Duration // Video contains video stream information. // Nil if the source has no video stream. Video *VideoMeta // Audio contains audio stream information. // Nil if the source has no audio stream. Audio *AudioMeta } // VideoFrame represents a decoded video frame. type VideoFrame struct { // Img contains the decoded image in NRGBA format. Img *image.NRGBA // PTS is the presentation timestamp relative // to the beginning of the stream. PTS time.Duration // Duration is the display duration of the frame. Duration time.Duration } // AudioFrame represents decoded PCM audio samples. type AudioFrame struct { // Samples contains interleaved float32 PCM samples. Samples []float32 // Channels is the number of channels represented // in Samples. Channels int // SampleRate is the audio sample rate in Hz. SampleRate int // PTS is the presentation timestamp relative // to the beginning of the stream. PTS time.Duration } // Frame represents either a decoded video frame // or a decoded audio frame. type Frame struct { // Video is populated when the decoded frame // contains video data. Video *VideoFrame // Audio is populated when the decoded frame // contains audio data. Audio *AudioFrame } // Decoder provides sequential decoding of audio // and video frames from an io.ReadSeeker source. type Decoder struct { handle cgo.Handle fmtCtx *C.AVFormatContext videoIdx C.int audioIdx C.int vCodecCtx *C.AVCodecContext aCodecCtx *C.AVCodecContext swrCtx *C.SwrContext avFrame *C.AVFrame avPkt *C.AVPacket avioCtx *C.AVIOContext meta Meta } type readerState struct { r io.Reader s io.Seeker } //export goReadPacket func goReadPacket(opaque unsafe.Pointer, buf *C.uint8_t, bufSize C.int) C.int { h := *(*cgo.Handle)(opaque) rs := h.Value().(*readerState) b := make([]byte, int(bufSize)) n, err := rs.r.Read(b) if n > 0 { C.memcpy(unsafe.Pointer(buf), unsafe.Pointer(&b[0]), C.size_t(n)) } if err != nil { if errors.Is(err, io.EOF) { return C.int(C.AVERROR_EOF) } return -1 } return C.int(n) } //export goSeek func goSeek(opaque unsafe.Pointer, offset C.int64_t, whence C.int) C.int64_t { h := *(*cgo.Handle)(opaque) rs := h.Value().(*readerState) if rs.s == nil { return -1 } switch int(whence) { case 0x10000: if sz, ok := rs.r.(interface{ Size() int64 }); ok { return C.int64_t(sz.Size()) } return -1 } var seekWhence int switch int(whence) { case 0: seekWhence = io.SeekStart case 1: seekWhence = io.SeekCurrent case 2: seekWhence = io.SeekEnd default: return -1 } n, err := rs.s.Seek(int64(offset), seekWhence) if err != nil { return -1 } return C.int64_t(n) } // NewDecoder creates a decoder that reads media data // from r using FFmpeg custom I/O. // // If r also implements io.Seeker, FFmpeg may perform // random access operations when required by the // container format. // // When seeking is unavailable: // // - decoding remains fully supported // - metadata extraction may be less accurate // - reported duration may be unavailable // - some container formats may not expose all stream // information until more data has been read // // Typical streaming sources such as HTTP responses, // SSH pipes and stdin are supported. func NewDecoder(r io.Reader) (*Decoder, error) { d := &Decoder{ videoIdx: -1, audioIdx: -1, } rs := &readerState{ r: r, } if s, ok := r.(io.Seeker); ok { rs.s = s } d.handle = cgo.NewHandle(rs) handlePtr := (*C.uchar)(unsafe.Pointer(&d.handle)) avioCtx := C.alloc_avio_context(unsafe.Pointer(handlePtr)) if avioCtx == nil { d.handle.Delete() return nil, fmt.Errorf("avio_alloc_context failed") } d.avioCtx = avioCtx d.fmtCtx = C.avformat_alloc_context() if d.fmtCtx == nil { C.av_free(unsafe.Pointer(avioCtx)) d.handle.Delete() return nil, fmt.Errorf("avformat_alloc_context failed") } d.fmtCtx.pb = avioCtx d.fmtCtx.flags |= C.AVFMT_FLAG_CUSTOM_IO C.av_log_set_level(C.AV_LOG_QUIET) ret := C.avformat_open_input(&d.fmtCtx, nil, nil, nil) if ret < 0 { d.handle.Delete() return nil, fmt.Errorf("avformat_open_input: %d", ret) } ret = C.avformat_find_stream_info(d.fmtCtx, nil) if ret < 0 { C.avformat_close_input(&d.fmtCtx) d.handle.Delete() return nil, fmt.Errorf("avformat_find_stream_info: %d", ret) } for i := C.uint(0); i < d.fmtCtx.nb_streams; i++ { stream := *(**C.AVStream)(unsafe.Add( unsafe.Pointer(d.fmtCtx.streams), uintptr(i)*unsafe.Sizeof(uintptr(0))), ) switch stream.codecpar.codec_type { case C.AVMEDIA_TYPE_VIDEO: if d.videoIdx < 0 { d.videoIdx = C.int(i) } case C.AVMEDIA_TYPE_AUDIO: if d.audioIdx < 0 { d.audioIdx = C.int(i) } } } if d.videoIdx >= 0 { if err := d.openVideoCodec(); err != nil { d.Close() return nil, err } } if d.audioIdx >= 0 { if err := d.openAudioCodec(); err != nil { d.Close() return nil, err } } d.avFrame = C.av_frame_alloc() d.avPkt = C.av_packet_alloc() d.buildMeta() return d, nil } func (d *Decoder) openVideoCodec() error { stream := d.stream(d.videoIdx) var codec *C.AVCodec switch stream.codecpar.codec_id { case C.AV_CODEC_ID_AV1: name := C.CString("libdav1d") codec = C.avcodec_find_decoder_by_name(name) C.free(unsafe.Pointer(name)) if codec == nil { name = C.CString("av1") codec = C.avcodec_find_decoder_by_name(name) C.free(unsafe.Pointer(name)) } default: codec = C.avcodec_find_decoder(stream.codecpar.codec_id) } if codec == nil { return fmt.Errorf("no video decoder for codec_id %d", stream.codecpar.codec_id) } d.vCodecCtx = C.avcodec_alloc_context3(codec) if d.vCodecCtx == nil { return fmt.Errorf("avcodec_alloc_context3 video failed") } if ret := C.avcodec_parameters_to_context(d.vCodecCtx, stream.codecpar); ret < 0 { return fmt.Errorf("avcodec_parameters_to_context video: %d", ret) } d.vCodecCtx.get_format = (*[0]byte)(unsafe.Pointer(C.get_sw_format)) d.vCodecCtx.hwaccel = nil d.vCodecCtx.hwaccel_context = nil if ret := C.avcodec_open2(d.vCodecCtx, codec, nil); ret < 0 { return fmt.Errorf("avcodec_open2 video: %d", ret) } return nil } func (d *Decoder) openAudioCodec() error { stream := d.stream(d.audioIdx) codec := C.avcodec_find_decoder(stream.codecpar.codec_id) if codec == nil { return fmt.Errorf("no audio decoder for codec_id %d", stream.codecpar.codec_id) } d.aCodecCtx = C.avcodec_alloc_context3(codec) if d.aCodecCtx == nil { return fmt.Errorf("avcodec_alloc_context3 audio failed") } if ret := C.avcodec_parameters_to_context(d.aCodecCtx, stream.codecpar); ret < 0 { return fmt.Errorf("avcodec_parameters_to_context audio: %d", ret) } if ret := C.avcodec_open2(d.aCodecCtx, codec, nil); ret < 0 { return fmt.Errorf("avcodec_open2 audio: %d", ret) } d.swrCtx = C.swr_alloc() if d.swrCtx == nil { return fmt.Errorf("swr_alloc failed") } inLayout := C.CString("in_chlayout") outLayout := C.CString("out_chlayout") inRate := C.CString("in_sample_rate") outRate := C.CString("out_sample_rate") inFmt := C.CString("in_sample_fmt") outFmt := C.CString("out_sample_fmt") defer func() { C.free(unsafe.Pointer(inLayout)) C.free(unsafe.Pointer(outLayout)) C.free(unsafe.Pointer(inRate)) C.free(unsafe.Pointer(outRate)) C.free(unsafe.Pointer(inFmt)) C.free(unsafe.Pointer(outFmt)) }() C.av_opt_set_chlayout(unsafe.Pointer(d.swrCtx), inLayout, &d.aCodecCtx.ch_layout, 0) C.av_opt_set_chlayout(unsafe.Pointer(d.swrCtx), outLayout, &d.aCodecCtx.ch_layout, 0) C.av_opt_set_int(unsafe.Pointer(d.swrCtx), inRate, C.int64_t(d.aCodecCtx.sample_rate), 0) C.av_opt_set_int(unsafe.Pointer(d.swrCtx), outRate, C.int64_t(d.aCodecCtx.sample_rate), 0) C.av_opt_set_int(unsafe.Pointer(d.swrCtx), inFmt, C.int64_t(d.aCodecCtx.sample_fmt), 0) C.av_opt_set_int(unsafe.Pointer(d.swrCtx), outFmt, C.int64_t(C.AV_SAMPLE_FMT_FLTP), 0) if ret := C.swr_init(d.swrCtx); ret < 0 { return fmt.Errorf("swr_init: %d", ret) } return nil } func (d *Decoder) stream(idx C.int) *C.AVStream { return *(**C.AVStream)(unsafe.Pointer( uintptr(unsafe.Pointer(d.fmtCtx.streams)) + uintptr(idx)*unsafe.Sizeof(uintptr(0)), )) } func (d *Decoder) buildMeta() { d.meta = Meta{ Duration: time.Duration( float64(d.fmtCtx.duration) * float64(time.Second) / float64(C.AV_TIME_BASE), ), } if d.videoIdx >= 0 { s := d.stream(d.videoIdx) d.meta.Video = &VideoMeta{ Width: int(s.codecpar.width), Height: int(s.codecpar.height), FPSNum: int(s.avg_frame_rate.num), FPSDen: int(s.avg_frame_rate.den), CodecName: C.GoString(C.avcodec_get_name(s.codecpar.codec_id)), } } if d.audioIdx >= 0 { s := d.stream(d.audioIdx) d.meta.Audio = &AudioMeta{ SampleRate: int(s.codecpar.sample_rate), Channels: int(s.codecpar.ch_layout.nb_channels), CodecName: C.GoString(C.avcodec_get_name(s.codecpar.codec_id)), } } } // NewDecoder creates a decoder that reads media data // from r using FFmpeg custom I/O. // // If r also implements io.Seeker, FFmpeg may perform // random access operations when required by the // container format. // // When seeking is unavailable: // // - decoding remains fully supported // - metadata extraction may be less accurate // - reported duration may be unavailable // - some container formats may not expose all stream // information until more data has been read // // Typical streaming sources such as HTTP responses, // SSH pipes and stdin are supported. func (d *Decoder) Meta() Meta { return d.meta } // DecodeFrame decodes the next available frame. // // DecodeFrame works with both seekable and // non-seekable inputs. // // For streaming inputs FFmpeg processes data // incrementally as it becomes available. func (d *Decoder) DecodeFrame() (*Frame, error) { for { ret := C.av_read_frame(d.fmtCtx, d.avPkt) if ret == C.int(C.AVERROR_EOF) { return nil, nil } if ret < 0 { return nil, fmt.Errorf("av_read_frame: %d", ret) } streamIdx := C.int(d.avPkt.stream_index) var frame *Frame var err error switch streamIdx { case d.videoIdx: frame, err = d.decodeVideo() case d.audioIdx: frame, err = d.decodeAudio() } C.av_packet_unref(d.avPkt) if err != nil { return nil, err } if frame != nil { return frame, nil } } } func (d *Decoder) decodeVideo() (*Frame, error) { ret := C.avcodec_send_packet(d.vCodecCtx, d.avPkt) if ret < 0 { return nil, fmt.Errorf("avcodec_send_packet video: %d", ret) } ret = C.avcodec_receive_frame(d.vCodecCtx, d.avFrame) if ret == -C.EAGAIN || ret == C.int(C.AVERROR_EOF) { return nil, nil } if ret < 0 { return nil, fmt.Errorf("avcodec_receive_frame video: %d", ret) } defer C.av_frame_unref(d.avFrame) img, err := frameToNRGBA(d.avFrame) if err != nil { return nil, err } stream := d.stream(d.videoIdx) tb := stream.time_base pts := streamTimeToDuration(int64(d.avFrame.pts), tb.num, tb.den) dur := streamTimeToDuration(int64(d.avFrame.duration), tb.num, tb.den) return &Frame{Video: &VideoFrame{Img: img, PTS: pts, Duration: dur}}, nil } func (d *Decoder) decodeAudio() (*Frame, error) { ret := C.avcodec_send_packet(d.aCodecCtx, d.avPkt) if ret < 0 { return nil, fmt.Errorf("avcodec_send_packet audio: %d", ret) } ret = C.avcodec_receive_frame(d.aCodecCtx, d.avFrame) if ret == -C.EAGAIN || ret == C.int(C.AVERROR_EOF) { return nil, nil } if ret < 0 { return nil, fmt.Errorf("avcodec_receive_frame audio: %d", ret) } defer C.av_frame_unref(d.avFrame) nSamples := int(d.avFrame.nb_samples) nChannels := int(d.aCodecCtx.ch_layout.nb_channels) outBuf := make([]float32, nSamples*nChannels) outPtr := (**C.uint8_t)(unsafe.Pointer(&outBuf[0])) outSamples := C.swr_convert(d.swrCtx, outPtr, C.int(nSamples), (**C.uint8_t)(unsafe.Pointer(&d.avFrame.data[0])), C.int(nSamples)) if outSamples < 0 { return nil, fmt.Errorf("swr_convert: %d", outSamples) } stream := d.stream(d.audioIdx) tb := stream.time_base pts := streamTimeToDuration(int64(d.avFrame.pts), tb.num, tb.den) return &Frame{Audio: &AudioFrame{ Samples: outBuf[:int(outSamples)*nChannels], Channels: nChannels, SampleRate: int(d.aCodecCtx.sample_rate), PTS: pts, }}, nil } func frameToNRGBA(f *C.AVFrame) (*image.NRGBA, error) { w := int(f.width) h := int(f.height) img := image.NewNRGBA(image.Rect(0, 0, w, h)) if f.format == C.int(C.AV_PIX_FMT_YUV420P) { yPlane := unsafe.Slice((*byte)(unsafe.Pointer(f.data[0])), int(f.linesize[0])*h) uPlane := unsafe.Slice((*byte)(unsafe.Pointer(f.data[1])), int(f.linesize[1])*h/2) vPlane := unsafe.Slice((*byte)(unsafe.Pointer(f.data[2])), int(f.linesize[2])*h/2) yStride := int(f.linesize[0]) uStride := int(f.linesize[1]) for row := range h { for col := range w { yv := float64(yPlane[row*yStride+col]) uv := float64(uPlane[(row/2)*uStride+col/2]) - 128 vv := float64(vPlane[(row/2)*uStride+col/2]) - 128 off := img.PixOffset(col, row) img.Pix[off+0] = clamp(yv + 1.402*vv) img.Pix[off+1] = clamp(yv - 0.344136*uv - 0.714136*vv) img.Pix[off+2] = clamp(yv + 1.772*uv) img.Pix[off+3] = 255 } } return img, nil } return nil, fmt.Errorf("unsupported pixel format: %d", int(f.format)) } func clamp(v float64) uint8 { if v < 0 { return 0 } if v > 255 { return 255 } return uint8(v) } func streamTimeToDuration(value int64, num C.int, den C.int) time.Duration { if den == 0 { return 0 } seconds := float64(value) * float64(num) / float64(den) return time.Duration(seconds * float64(time.Second)) } // Close releases all FFmpeg resources owned by // the decoder. // // It is safe to call Close multiple times. func (d *Decoder) Close() { if d.avFrame != nil { C.av_frame_free(&d.avFrame) } if d.avPkt != nil { C.av_packet_free(&d.avPkt) } if d.swrCtx != nil { C.swr_free(&d.swrCtx) } if d.vCodecCtx != nil { C.avcodec_free_context(&d.vCodecCtx) } if d.aCodecCtx != nil { C.avcodec_free_context(&d.aCodecCtx) } if d.fmtCtx != nil { C.avformat_close_input(&d.fmtCtx) } if d.avioCtx != nil { C.avio_context_free(&d.avioCtx) } if d.handle != 0 { d.handle.Delete() d.handle = 0 } }