almost complete implementation

2026-06-07 00:40:27 +03:00
parent ac1015d603
commit ad945bbbaa
175 changed files with 38200 additions and 1 deletions
@@ -0,0 +1,635 @@
+package gopeg
+
+/*
+#cgo CFLAGS: -I${SRCDIR}/vendor/include
+#cgo linux,amd64 LDFLAGS: -L${SRCDIR}/vendor/linux_amd64 -lavformat -lavcodec -lavutil -lswresample -ldav1d -lm -lz -lpthread -ldrm
+#cgo linux,arm64 LDFLAGS: -L${SRCDIR}/vendor/linux_arm64 -lavformat -lavcodec -lavutil -lswresample -ldav1d -lm -lz -lpthread -ldrm
+
+#cgo windows,amd64 LDFLAGS: -L${SRCDIR}/vendor/windows_amd64 -lavformat -lavcodec -lavutil -lswresample -ldav1d -lm -lz
+
+#cgo darwin,arm64 LDFLAGS: -L${SRCDIR}/vendor/darwin_arm64 -lavformat -lavcodec -lavutil -lswresample -ldav1d -lm -lz
+#cgo darwin,amd64 LDFLAGS: -L${SRCDIR}/vendor/darwin_amd64 -lavformat -lavcodec -lavutil -lswresample -ldav1d -lm -lz
+
+#include <libavformat/avformat.h>
+#include <libavcodec/avcodec.h>
+#include <libavutil/imgutils.h>
+#include <libavutil/opt.h>
+#include <libswresample/swresample.h>
+#include <stdlib.h>
+#include <string.h>
+#include <libavutil/pixfmt.h>
+#include <libavutil/channel_layout.h>
+
+extern int goReadPacket(void *opaque, uint8_t *buf, int buf_size);
+extern int64_t goSeek(void *opaque, int64_t offset, int whence);
+
+static AVIOContext* alloc_avio_context(void *opaque) {
+    const int bufSize = 65536;
+    uint8_t *buf = (uint8_t*)av_malloc(bufSize);
+    if (!buf) return NULL;
+    return avio_alloc_context(buf, bufSize, 0, opaque, goReadPacket, NULL, goSeek);
+}
+
+extern enum AVPixelFormat get_sw_format(AVCodecContext *ctx, const enum AVPixelFormat *pix_fmts);
+*/
+import "C"
+
+import (
+	"errors"
+	"fmt"
+	"image"
+	"io"
+	"runtime/cgo"
+	"time"
+	"unsafe"
+)
+
+// VideoMeta describes the video stream found in the media source.
+type VideoMeta struct {
+	// Width is the encoded frame width in pixels.
+	Width int
+
+	// Height is the encoded frame height in pixels.
+	Height int
+
+	// FPSNum is the frame-rate numerator.
+	FPSNum int
+
+	// FPSDen is the frame-rate denominator.
+	FPSDen int
+
+	// CodecName is the FFmpeg codec identifier
+	// (for example: "h264", "hevc", "vp9", "av1").
+	CodecName string
+}
+
+// AudioMeta describes the audio stream found in the media source.
+type AudioMeta struct {
+	// SampleRate is the stream sample rate in Hz.
+	SampleRate int
+
+	// Channels is the number of audio channels.
+	Channels int
+
+	// CodecName is the FFmpeg codec identifier
+	// (for example: "aac", "opus", "mp3").
+	CodecName string
+}
+
+// Meta contains container-level media information.
+type Meta struct {
+	// Duration is the total media duration.
+	Duration time.Duration
+
+	// Video contains video stream information.
+	// Nil if the source has no video stream.
+	Video *VideoMeta
+
+	// Audio contains audio stream information.
+	// Nil if the source has no audio stream.
+	Audio *AudioMeta
+}
+
+// VideoFrame represents a decoded video frame.
+type VideoFrame struct {
+	// Img contains the decoded image in NRGBA format.
+	Img *image.NRGBA
+
+	// PTS is the presentation timestamp relative
+	// to the beginning of the stream.
+	PTS time.Duration
+
+	// Duration is the display duration of the frame.
+	Duration time.Duration
+}
+
+// AudioFrame represents decoded PCM audio samples.
+type AudioFrame struct {
+	// Samples contains interleaved float32 PCM samples.
+	Samples []float32
+
+	// Channels is the number of channels represented
+	// in Samples.
+	Channels int
+
+	// SampleRate is the audio sample rate in Hz.
+	SampleRate int
+
+	// PTS is the presentation timestamp relative
+	// to the beginning of the stream.
+	PTS time.Duration
+}
+
+// Frame represents either a decoded video frame
+// or a decoded audio frame.
+type Frame struct {
+	// Video is populated when the decoded frame
+	// contains video data.
+	Video *VideoFrame
+
+	// Audio is populated when the decoded frame
+	// contains audio data.
+	Audio *AudioFrame
+}
+
+// Decoder provides sequential decoding of audio
+// and video frames from an io.ReadSeeker source.
+type Decoder struct {
+	handle    cgo.Handle
+	fmtCtx    *C.AVFormatContext
+	videoIdx  C.int
+	audioIdx  C.int
+	vCodecCtx *C.AVCodecContext
+	aCodecCtx *C.AVCodecContext
+	swrCtx    *C.SwrContext
+	avFrame   *C.AVFrame
+	avPkt     *C.AVPacket
+	avioCtx   *C.AVIOContext
+	meta      Meta
+}
+
+type readerState struct {
+	r io.Reader
+	s io.Seeker
+}
+
+//export goReadPacket
+func goReadPacket(opaque unsafe.Pointer, buf *C.uint8_t, bufSize C.int) C.int {
+	h := *(*cgo.Handle)(opaque)
+	rs := h.Value().(*readerState)
+	b := make([]byte, int(bufSize))
+	n, err := rs.r.Read(b)
+	if n > 0 {
+		C.memcpy(unsafe.Pointer(buf), unsafe.Pointer(&b[0]), C.size_t(n))
+	}
+	if err != nil {
+		if errors.Is(err, io.EOF) {
+			return C.int(C.AVERROR_EOF)
+		}
+		return -1
+	}
+	return C.int(n)
+}
+
+//export goSeek
+func goSeek(opaque unsafe.Pointer, offset C.int64_t, whence C.int) C.int64_t {
+	h := *(*cgo.Handle)(opaque)
+	rs := h.Value().(*readerState)
+
+	if rs.s == nil {
+		return -1
+	}
+
+	switch int(whence) {
+	case 0x10000:
+		if sz, ok := rs.r.(interface{ Size() int64 }); ok {
+			return C.int64_t(sz.Size())
+		}
+		return -1
+	}
+	var seekWhence int
+	switch int(whence) {
+	case 0:
+		seekWhence = io.SeekStart
+	case 1:
+		seekWhence = io.SeekCurrent
+	case 2:
+		seekWhence = io.SeekEnd
+	default:
+		return -1
+	}
+	n, err := rs.s.Seek(int64(offset), seekWhence)
+	if err != nil {
+		return -1
+	}
+	return C.int64_t(n)
+}
+
+// NewDecoder creates a decoder that reads media data
+// from r using FFmpeg custom I/O.
+//
+// If r also implements io.Seeker, FFmpeg may perform
+// random access operations when required by the
+// container format.
+//
+// When seeking is unavailable:
+//
+//   - decoding remains fully supported
+//   - metadata extraction may be less accurate
+//   - reported duration may be unavailable
+//   - some container formats may not expose all stream
+//     information until more data has been read
+//
+// Typical streaming sources such as HTTP responses,
+// SSH pipes and stdin are supported.
+func NewDecoder(r io.Reader) (*Decoder, error) {
+	d := &Decoder{
+		videoIdx: -1,
+		audioIdx: -1,
+	}
+
+	rs := &readerState{
+		r: r,
+	}
+
+	if s, ok := r.(io.Seeker); ok {
+		rs.s = s
+	}
+
+	d.handle = cgo.NewHandle(rs)
+
+	handlePtr := (*C.uchar)(unsafe.Pointer(&d.handle))
+	avioCtx := C.alloc_avio_context(unsafe.Pointer(handlePtr))
+	if avioCtx == nil {
+		d.handle.Delete()
+		return nil, fmt.Errorf("avio_alloc_context failed")
+	}
+	d.avioCtx = avioCtx
+
+	d.fmtCtx = C.avformat_alloc_context()
+	if d.fmtCtx == nil {
+		C.av_free(unsafe.Pointer(avioCtx))
+		d.handle.Delete()
+		return nil, fmt.Errorf("avformat_alloc_context failed")
+	}
+	d.fmtCtx.pb = avioCtx
+	d.fmtCtx.flags |= C.AVFMT_FLAG_CUSTOM_IO
+	C.av_log_set_level(C.AV_LOG_QUIET)
+
+	ret := C.avformat_open_input(&d.fmtCtx, nil, nil, nil)
+	if ret < 0 {
+		d.handle.Delete()
+		return nil, fmt.Errorf("avformat_open_input: %d", ret)
+	}
+
+	ret = C.avformat_find_stream_info(d.fmtCtx, nil)
+	if ret < 0 {
+		C.avformat_close_input(&d.fmtCtx)
+		d.handle.Delete()
+		return nil, fmt.Errorf("avformat_find_stream_info: %d", ret)
+	}
+
+	for i := C.uint(0); i < d.fmtCtx.nb_streams; i++ {
+		stream := *(**C.AVStream)(unsafe.Add(
+			unsafe.Pointer(d.fmtCtx.streams),
+			uintptr(i)*unsafe.Sizeof(uintptr(0))),
+		)
+		switch stream.codecpar.codec_type {
+		case C.AVMEDIA_TYPE_VIDEO:
+			if d.videoIdx < 0 {
+				d.videoIdx = C.int(i)
+			}
+		case C.AVMEDIA_TYPE_AUDIO:
+			if d.audioIdx < 0 {
+				d.audioIdx = C.int(i)
+			}
+		}
+	}
+
+	if d.videoIdx >= 0 {
+		if err := d.openVideoCodec(); err != nil {
+			d.Close()
+			return nil, err
+		}
+	}
+	if d.audioIdx >= 0 {
+		if err := d.openAudioCodec(); err != nil {
+			d.Close()
+			return nil, err
+		}
+	}
+
+	d.avFrame = C.av_frame_alloc()
+	d.avPkt = C.av_packet_alloc()
+	d.buildMeta()
+	return d, nil
+}
+
+func (d *Decoder) openVideoCodec() error {
+	stream := d.stream(d.videoIdx)
+
+	var codec *C.AVCodec
+	switch stream.codecpar.codec_id {
+	case C.AV_CODEC_ID_AV1:
+		name := C.CString("libdav1d")
+		codec = C.avcodec_find_decoder_by_name(name)
+		C.free(unsafe.Pointer(name))
+
+		if codec == nil {
+			name = C.CString("av1")
+			codec = C.avcodec_find_decoder_by_name(name)
+			C.free(unsafe.Pointer(name))
+		}
+	default:
+		codec = C.avcodec_find_decoder(stream.codecpar.codec_id)
+	}
+	if codec == nil {
+		return fmt.Errorf("no video decoder for codec_id %d", stream.codecpar.codec_id)
+	}
+
+	d.vCodecCtx = C.avcodec_alloc_context3(codec)
+	if d.vCodecCtx == nil {
+		return fmt.Errorf("avcodec_alloc_context3 video failed")
+	}
+	if ret := C.avcodec_parameters_to_context(d.vCodecCtx, stream.codecpar); ret < 0 {
+		return fmt.Errorf("avcodec_parameters_to_context video: %d", ret)
+	}
+	d.vCodecCtx.get_format = (*[0]byte)(unsafe.Pointer(C.get_sw_format))
+	d.vCodecCtx.hwaccel = nil
+	d.vCodecCtx.hwaccel_context = nil
+	if ret := C.avcodec_open2(d.vCodecCtx, codec, nil); ret < 0 {
+		return fmt.Errorf("avcodec_open2 video: %d", ret)
+	}
+	return nil
+}
+
+func (d *Decoder) openAudioCodec() error {
+	stream := d.stream(d.audioIdx)
+	codec := C.avcodec_find_decoder(stream.codecpar.codec_id)
+	if codec == nil {
+		return fmt.Errorf("no audio decoder for codec_id %d", stream.codecpar.codec_id)
+	}
+
+	d.aCodecCtx = C.avcodec_alloc_context3(codec)
+	if d.aCodecCtx == nil {
+		return fmt.Errorf("avcodec_alloc_context3 audio failed")
+	}
+	if ret := C.avcodec_parameters_to_context(d.aCodecCtx, stream.codecpar); ret < 0 {
+		return fmt.Errorf("avcodec_parameters_to_context audio: %d", ret)
+	}
+	if ret := C.avcodec_open2(d.aCodecCtx, codec, nil); ret < 0 {
+		return fmt.Errorf("avcodec_open2 audio: %d", ret)
+	}
+
+	d.swrCtx = C.swr_alloc()
+	if d.swrCtx == nil {
+		return fmt.Errorf("swr_alloc failed")
+	}
+
+	inLayout := C.CString("in_chlayout")
+	outLayout := C.CString("out_chlayout")
+	inRate := C.CString("in_sample_rate")
+	outRate := C.CString("out_sample_rate")
+	inFmt := C.CString("in_sample_fmt")
+	outFmt := C.CString("out_sample_fmt")
+	defer func() {
+		C.free(unsafe.Pointer(inLayout))
+		C.free(unsafe.Pointer(outLayout))
+		C.free(unsafe.Pointer(inRate))
+		C.free(unsafe.Pointer(outRate))
+		C.free(unsafe.Pointer(inFmt))
+		C.free(unsafe.Pointer(outFmt))
+	}()
+
+	C.av_opt_set_chlayout(unsafe.Pointer(d.swrCtx), inLayout, &d.aCodecCtx.ch_layout, 0)
+	C.av_opt_set_chlayout(unsafe.Pointer(d.swrCtx), outLayout, &d.aCodecCtx.ch_layout, 0)
+	C.av_opt_set_int(unsafe.Pointer(d.swrCtx), inRate, C.int64_t(d.aCodecCtx.sample_rate), 0)
+	C.av_opt_set_int(unsafe.Pointer(d.swrCtx), outRate, C.int64_t(d.aCodecCtx.sample_rate), 0)
+	C.av_opt_set_int(unsafe.Pointer(d.swrCtx), inFmt, C.int64_t(d.aCodecCtx.sample_fmt), 0)
+	C.av_opt_set_int(unsafe.Pointer(d.swrCtx), outFmt, C.int64_t(C.AV_SAMPLE_FMT_FLTP), 0)
+
+	if ret := C.swr_init(d.swrCtx); ret < 0 {
+		return fmt.Errorf("swr_init: %d", ret)
+	}
+	return nil
+}
+
+func (d *Decoder) stream(idx C.int) *C.AVStream {
+	return *(**C.AVStream)(unsafe.Pointer(
+		uintptr(unsafe.Pointer(d.fmtCtx.streams)) + uintptr(idx)*unsafe.Sizeof(uintptr(0)),
+	))
+}
+
+func (d *Decoder) buildMeta() {
+	d.meta = Meta{
+		Duration: time.Duration(
+			float64(d.fmtCtx.duration) *
+				float64(time.Second) /
+				float64(C.AV_TIME_BASE),
+		),
+	}
+
+	if d.videoIdx >= 0 {
+		s := d.stream(d.videoIdx)
+		d.meta.Video = &VideoMeta{
+			Width:     int(s.codecpar.width),
+			Height:    int(s.codecpar.height),
+			FPSNum:    int(s.avg_frame_rate.num),
+			FPSDen:    int(s.avg_frame_rate.den),
+			CodecName: C.GoString(C.avcodec_get_name(s.codecpar.codec_id)),
+		}
+	}
+	if d.audioIdx >= 0 {
+		s := d.stream(d.audioIdx)
+		d.meta.Audio = &AudioMeta{
+			SampleRate: int(s.codecpar.sample_rate),
+			Channels:   int(s.codecpar.ch_layout.nb_channels),
+			CodecName:  C.GoString(C.avcodec_get_name(s.codecpar.codec_id)),
+		}
+	}
+}
+
+// NewDecoder creates a decoder that reads media data
+// from r using FFmpeg custom I/O.
+//
+// If r also implements io.Seeker, FFmpeg may perform
+// random access operations when required by the
+// container format.
+//
+// When seeking is unavailable:
+//
+//   - decoding remains fully supported
+//   - metadata extraction may be less accurate
+//   - reported duration may be unavailable
+//   - some container formats may not expose all stream
+//     information until more data has been read
+//
+// Typical streaming sources such as HTTP responses,
+// SSH pipes and stdin are supported.
+func (d *Decoder) Meta() Meta { return d.meta }
+
+// DecodeFrame decodes the next available frame.
+//
+// DecodeFrame works with both seekable and
+// non-seekable inputs.
+//
+// For streaming inputs FFmpeg processes data
+// incrementally as it becomes available.
+func (d *Decoder) DecodeFrame() (*Frame, error) {
+	for {
+		ret := C.av_read_frame(d.fmtCtx, d.avPkt)
+		if ret == C.int(C.AVERROR_EOF) {
+			return nil, nil
+		}
+		if ret < 0 {
+			return nil, fmt.Errorf("av_read_frame: %d", ret)
+		}
+
+		streamIdx := C.int(d.avPkt.stream_index)
+		var frame *Frame
+		var err error
+
+		switch streamIdx {
+		case d.videoIdx:
+			frame, err = d.decodeVideo()
+		case d.audioIdx:
+			frame, err = d.decodeAudio()
+		}
+
+		C.av_packet_unref(d.avPkt)
+
+		if err != nil {
+			return nil, err
+		}
+		if frame != nil {
+			return frame, nil
+		}
+	}
+}
+
+func (d *Decoder) decodeVideo() (*Frame, error) {
+	ret := C.avcodec_send_packet(d.vCodecCtx, d.avPkt)
+	if ret < 0 {
+		return nil, fmt.Errorf("avcodec_send_packet video: %d", ret)
+	}
+	ret = C.avcodec_receive_frame(d.vCodecCtx, d.avFrame)
+	if ret == -C.EAGAIN || ret == C.int(C.AVERROR_EOF) {
+		return nil, nil
+	}
+	if ret < 0 {
+		return nil, fmt.Errorf("avcodec_receive_frame video: %d", ret)
+	}
+	defer C.av_frame_unref(d.avFrame)
+
+	img, err := frameToNRGBA(d.avFrame)
+	if err != nil {
+		return nil, err
+	}
+
+	stream := d.stream(d.videoIdx)
+	tb := stream.time_base
+	pts := streamTimeToDuration(int64(d.avFrame.pts), tb.num, tb.den)
+	dur := streamTimeToDuration(int64(d.avFrame.duration), tb.num, tb.den)
+
+	return &Frame{Video: &VideoFrame{Img: img, PTS: pts, Duration: dur}}, nil
+}
+
+func (d *Decoder) decodeAudio() (*Frame, error) {
+	ret := C.avcodec_send_packet(d.aCodecCtx, d.avPkt)
+	if ret < 0 {
+		return nil, fmt.Errorf("avcodec_send_packet audio: %d", ret)
+	}
+	ret = C.avcodec_receive_frame(d.aCodecCtx, d.avFrame)
+	if ret == -C.EAGAIN || ret == C.int(C.AVERROR_EOF) {
+		return nil, nil
+	}
+	if ret < 0 {
+		return nil, fmt.Errorf("avcodec_receive_frame audio: %d", ret)
+	}
+	defer C.av_frame_unref(d.avFrame)
+
+	nSamples := int(d.avFrame.nb_samples)
+	nChannels := int(d.aCodecCtx.ch_layout.nb_channels)
+
+	outBuf := make([]float32, nSamples*nChannels)
+	outPtr := (**C.uint8_t)(unsafe.Pointer(&outBuf[0]))
+	outSamples := C.swr_convert(d.swrCtx, outPtr, C.int(nSamples),
+		(**C.uint8_t)(unsafe.Pointer(&d.avFrame.data[0])), C.int(nSamples))
+	if outSamples < 0 {
+		return nil, fmt.Errorf("swr_convert: %d", outSamples)
+	}
+
+	stream := d.stream(d.audioIdx)
+	tb := stream.time_base
+	pts := streamTimeToDuration(int64(d.avFrame.pts), tb.num, tb.den)
+
+	return &Frame{Audio: &AudioFrame{
+		Samples:    outBuf[:int(outSamples)*nChannels],
+		Channels:   nChannels,
+		SampleRate: int(d.aCodecCtx.sample_rate),
+		PTS:        pts,
+	}}, nil
+}
+
+func frameToNRGBA(f *C.AVFrame) (*image.NRGBA, error) {
+	w := int(f.width)
+	h := int(f.height)
+	img := image.NewNRGBA(image.Rect(0, 0, w, h))
+
+	if f.format == C.int(C.AV_PIX_FMT_YUV420P) {
+		yPlane := unsafe.Slice((*byte)(unsafe.Pointer(f.data[0])), int(f.linesize[0])*h)
+		uPlane := unsafe.Slice((*byte)(unsafe.Pointer(f.data[1])), int(f.linesize[1])*h/2)
+		vPlane := unsafe.Slice((*byte)(unsafe.Pointer(f.data[2])), int(f.linesize[2])*h/2)
+		yStride := int(f.linesize[0])
+		uStride := int(f.linesize[1])
+		for row := range h {
+			for col := range w {
+				yv := float64(yPlane[row*yStride+col])
+				uv := float64(uPlane[(row/2)*uStride+col/2]) - 128
+				vv := float64(vPlane[(row/2)*uStride+col/2]) - 128
+				off := img.PixOffset(col, row)
+				img.Pix[off+0] = clamp(yv + 1.402*vv)
+				img.Pix[off+1] = clamp(yv - 0.344136*uv - 0.714136*vv)
+				img.Pix[off+2] = clamp(yv + 1.772*uv)
+				img.Pix[off+3] = 255
+			}
+		}
+		return img, nil
+	}
+	return nil, fmt.Errorf("unsupported pixel format: %d", int(f.format))
+}
+
+func clamp(v float64) uint8 {
+	if v < 0 {
+		return 0
+	}
+	if v > 255 {
+		return 255
+	}
+	return uint8(v)
+}
+
+func streamTimeToDuration(value int64, num C.int, den C.int) time.Duration {
+	if den == 0 {
+		return 0
+	}
+
+	seconds :=
+		float64(value) *
+			float64(num) /
+			float64(den)
+
+	return time.Duration(seconds * float64(time.Second))
+}
+
+// Close releases all FFmpeg resources owned by
+// the decoder.
+//
+// It is safe to call Close multiple times.
+func (d *Decoder) Close() {
+	if d.avFrame != nil {
+		C.av_frame_free(&d.avFrame)
+	}
+	if d.avPkt != nil {
+		C.av_packet_free(&d.avPkt)
+	}
+	if d.swrCtx != nil {
+		C.swr_free(&d.swrCtx)
+	}
+	if d.vCodecCtx != nil {
+		C.avcodec_free_context(&d.vCodecCtx)
+	}
+	if d.aCodecCtx != nil {
+		C.avcodec_free_context(&d.aCodecCtx)
+	}
+	if d.fmtCtx != nil {
+		C.avformat_close_input(&d.fmtCtx)
+	}
+	if d.avioCtx != nil {
+		C.avio_context_free(&d.avioCtx)
+	}
+	if d.handle != 0 {
+		d.handle.Delete()
+		d.handle = 0
+	}
+}