Files
2026-06-07 00:40:27 +03:00

636 lines
16 KiB
Go

package gopeg
/*
#cgo CFLAGS: -I${SRCDIR}/vendor/include
#cgo linux,amd64 LDFLAGS: -L${SRCDIR}/vendor/linux_amd64 -lavformat -lavcodec -lavutil -lswresample -ldav1d -lm -lz -lpthread -ldrm
#cgo linux,arm64 LDFLAGS: -L${SRCDIR}/vendor/linux_arm64 -lavformat -lavcodec -lavutil -lswresample -ldav1d -lm -lz -lpthread -ldrm
#cgo windows,amd64 LDFLAGS: -L${SRCDIR}/vendor/windows_amd64 -lavformat -lavcodec -lavutil -lswresample -ldav1d -lm -lz
#cgo darwin,arm64 LDFLAGS: -L${SRCDIR}/vendor/darwin_arm64 -lavformat -lavcodec -lavutil -lswresample -ldav1d -lm -lz
#cgo darwin,amd64 LDFLAGS: -L${SRCDIR}/vendor/darwin_amd64 -lavformat -lavcodec -lavutil -lswresample -ldav1d -lm -lz
#include <libavformat/avformat.h>
#include <libavcodec/avcodec.h>
#include <libavutil/imgutils.h>
#include <libavutil/opt.h>
#include <libswresample/swresample.h>
#include <stdlib.h>
#include <string.h>
#include <libavutil/pixfmt.h>
#include <libavutil/channel_layout.h>
extern int goReadPacket(void *opaque, uint8_t *buf, int buf_size);
extern int64_t goSeek(void *opaque, int64_t offset, int whence);
// alloc_avio_context creates a read-only AVIOContext whose read and seek
// callbacks are the exported Go functions goReadPacket and goSeek.
// opaque is handed back to those callbacks unchanged.
// Returns NULL if either the I/O buffer or the context cannot be allocated.
static AVIOContext* alloc_avio_context(void *opaque) {
    const int bufSize = 65536;
    uint8_t *buf = (uint8_t*)av_malloc(bufSize);
    if (!buf) return NULL;
    AVIOContext *ctx = avio_alloc_context(buf, bufSize, 0, opaque, goReadPacket, NULL, goSeek);
    // On failure nobody else owns buf yet, so release it here to avoid a leak.
    if (!ctx) av_free(buf);
    return ctx;
}
extern enum AVPixelFormat get_sw_format(AVCodecContext *ctx, const enum AVPixelFormat *pix_fmts);
*/
import "C"
import (
"errors"
"fmt"
"image"
"io"
"runtime/cgo"
"time"
"unsafe"
)
// VideoMeta holds the properties of the video stream
// selected from the media source.
type VideoMeta struct {
	// Width is the frame width, in pixels, as stored in the stream.
	Width int

	// Height is the frame height, in pixels, as stored in the stream.
	Height int

	// FPSNum is the numerator of the frame rate (FPSNum/FPSDen).
	FPSNum int

	// FPSDen is the denominator of the frame rate (FPSNum/FPSDen).
	FPSDen int

	// CodecName is the FFmpeg name of the codec,
	// e.g. "h264", "hevc", "vp9" or "av1".
	CodecName string
}
// AudioMeta holds the properties of the audio stream
// selected from the media source.
type AudioMeta struct {
	// SampleRate is the number of samples per second (Hz).
	SampleRate int

	// Channels is the channel count of the stream.
	Channels int

	// CodecName is the FFmpeg name of the codec,
	// e.g. "aac", "opus" or "mp3".
	CodecName string
}
// Meta is the container-level description of a media source.
type Meta struct {
	// Duration is the overall length of the media.
	Duration time.Duration

	// Video describes the video stream.
	// It is nil when the source carries no video.
	Video *VideoMeta

	// Audio describes the audio stream.
	// It is nil when the source carries no audio.
	Audio *AudioMeta
}
// VideoFrame is a single decoded picture together with its timing.
type VideoFrame struct {
	// Img is the decoded picture as a non-premultiplied RGBA image.
	Img *image.NRGBA

	// PTS is the presentation timestamp, measured from
	// the start of the stream.
	PTS time.Duration

	// Duration is how long the frame is meant to be shown.
	Duration time.Duration
}
// AudioFrame is a block of decoded PCM audio together with its timing.
type AudioFrame struct {
	// Samples holds float32 PCM samples, interleaved by channel.
	Samples []float32

	// Channels is the number of channels interleaved in Samples.
	Channels int

	// SampleRate is the number of samples per second (Hz).
	SampleRate int

	// PTS is the presentation timestamp, measured from
	// the start of the stream.
	PTS time.Duration
}
// Frame is one unit of decoder output: either a video
// frame or an audio frame.
type Frame struct {
	// Video is set when the frame carries video data.
	Video *VideoFrame

	// Audio is set when the frame carries audio data.
	Audio *AudioFrame
}
// Decoder decodes audio and video frames, one at a time,
// from a Go reader through FFmpeg.
type Decoder struct {
	// handle refers to the readerState used by the FFmpeg I/O callbacks.
	handle cgo.Handle

	// fmtCtx is the demuxer context.
	fmtCtx *C.AVFormatContext

	// videoIdx and audioIdx are the indices of the selected streams;
	// a negative value means no stream of that kind was selected.
	videoIdx C.int
	audioIdx C.int

	// vCodecCtx and aCodecCtx are the opened video and audio decoders.
	vCodecCtx *C.AVCodecContext
	aCodecCtx *C.AVCodecContext

	// swrCtx converts decoded audio to the output sample format.
	swrCtx *C.SwrContext

	// avFrame and avPkt are scratch objects reused across decode calls.
	avFrame *C.AVFrame
	avPkt   *C.AVPacket

	// avioCtx is the custom I/O context backed by the Go reader.
	avioCtx *C.AVIOContext

	// meta caches the information returned by Meta.
	meta Meta
}
// readerState bundles the Go-side I/O objects that the FFmpeg
// read and seek callbacks operate on.
type readerState struct {
	// r supplies the media bytes.
	r io.Reader

	// s performs seeks; it is nil when the source cannot seek.
	s io.Seeker
}
// goReadPacket is the AVIOContext read callback. It fills buf with up to
// bufSize bytes from the Go reader identified by opaque (a pointer to the
// cgo.Handle of a *readerState).
//
// It returns the number of bytes read, AVERROR_EOF at end of input, or -1
// on any other read error.
//
//export goReadPacket
func goReadPacket(opaque unsafe.Pointer, buf *C.uint8_t, bufSize C.int) C.int {
	if buf == nil || bufSize <= 0 {
		return -1
	}
	h := *(*cgo.Handle)(opaque)
	rs := h.Value().(*readerState)

	// Let the reader write straight into the C buffer; this avoids a
	// temporary Go allocation and a memcpy on every callback.
	dst := unsafe.Slice((*byte)(unsafe.Pointer(buf)), int(bufSize))

	// io.Reader may return (0, nil); retry a bounded number of times
	// instead of reporting a zero-length read to FFmpeg.
	const maxEmptyReads = 100
	for range maxEmptyReads {
		n, err := rs.r.Read(dst)
		if n > 0 {
			// Data takes priority over the error: a reader may return the
			// final bytes together with io.EOF, and those bytes must not be
			// dropped. The error resurfaces on the next call.
			return C.int(n)
		}
		if err == nil {
			continue
		}
		if errors.Is(err, io.EOF) {
			return C.int(C.AVERROR_EOF)
		}
		return -1
	}
	return -1
}
// goSeek is the AVIOContext seek callback. opaque is a pointer to the
// cgo.Handle of a *readerState.
//
// For a size query (AVSEEK_SIZE) it returns the source size when the
// reader exposes a Size() int64 method, otherwise -1. For regular seeks it
// returns the new absolute offset. It returns -1 when the source is not
// seekable, when whence is unknown, or when the seek fails.
//
//export goSeek
func goSeek(opaque unsafe.Pointer, offset C.int64_t, whence C.int) C.int64_t {
	// Values of FFmpeg's AVSEEK_SIZE and AVSEEK_FORCE whence flags.
	const (
		avseekSize  = 0x10000
		avseekForce = 0x20000
	)

	h := *(*cgo.Handle)(opaque)
	rs := h.Value().(*readerState)
	if rs.s == nil {
		return -1
	}

	// AVSEEK_FORCE is only a hint and may be ignored; strip it so a flagged
	// request is not rejected as an unknown whence.
	w := int(whence) &^ avseekForce

	if w&avseekSize != 0 {
		if sz, ok := rs.r.(interface{ Size() int64 }); ok {
			return C.int64_t(sz.Size())
		}
		return -1
	}

	var seekWhence int
	switch w {
	case 0: // SEEK_SET
		seekWhence = io.SeekStart
	case 1: // SEEK_CUR
		seekWhence = io.SeekCurrent
	case 2: // SEEK_END
		seekWhence = io.SeekEnd
	default:
		return -1
	}

	pos, err := rs.s.Seek(int64(offset), seekWhence)
	if err != nil {
		return -1
	}
	return C.int64_t(pos)
}
// NewDecoder creates a decoder that reads media data
// from r using FFmpeg custom I/O.
//
// If r also implements io.Seeker, FFmpeg may perform
// random access operations when required by the
// container format.
//
// When seeking is unavailable:
//
//   - decoding remains fully supported
//   - metadata extraction may be less accurate
//   - reported duration may be unavailable
//   - some container formats may not expose all stream
//     information until more data has been read
//
// Typical streaming sources such as HTTP responses,
// SSH pipes and stdin are supported.
//
// The first video stream and the first audio stream found are selected.
// On error every resource acquired so far is released and a nil decoder
// is returned. The caller must call Close on a successfully created
// decoder.
func NewDecoder(r io.Reader) (*Decoder, error) {
	d := &Decoder{
		videoIdx: -1,
		audioIdx: -1,
	}

	// fail funnels every error path through Close, which skips fields that
	// were never set, so no partially built state is leaked.
	fail := func(err error) (*Decoder, error) {
		d.Close()
		return nil, err
	}

	rs := &readerState{r: r}
	if s, ok := r.(io.Seeker); ok {
		rs.s = s
	}
	d.handle = cgo.NewHandle(rs)

	// NOTE(review): FFmpeg keeps this pointer into the Decoder for as long
	// as the AVIOContext lives, while cgo's pointer rules say C must not
	// retain Go pointers after the call returns. Consider passing the
	// handle value (or C-allocated storage holding it) instead.
	d.avioCtx = C.alloc_avio_context(unsafe.Pointer(&d.handle))
	if d.avioCtx == nil {
		return fail(fmt.Errorf("avio_alloc_context failed"))
	}

	d.fmtCtx = C.avformat_alloc_context()
	if d.fmtCtx == nil {
		return fail(fmt.Errorf("avformat_alloc_context failed"))
	}
	d.fmtCtx.pb = d.avioCtx
	d.fmtCtx.flags |= C.AVFMT_FLAG_CUSTOM_IO

	// Note: this changes FFmpeg's log level for the whole process.
	C.av_log_set_level(C.AV_LOG_QUIET)

	// On failure avformat_open_input frees the format context and clears
	// d.fmtCtx, but it does not release a custom AVIOContext; Close does.
	if ret := C.avformat_open_input(&d.fmtCtx, nil, nil, nil); ret < 0 {
		return fail(fmt.Errorf("avformat_open_input: %d", ret))
	}
	if ret := C.avformat_find_stream_info(d.fmtCtx, nil); ret < 0 {
		return fail(fmt.Errorf("avformat_find_stream_info: %d", ret))
	}

	// Pick the first stream of each kind.
	for i := C.int(0); i < C.int(d.fmtCtx.nb_streams); i++ {
		switch d.stream(i).codecpar.codec_type {
		case C.AVMEDIA_TYPE_VIDEO:
			if d.videoIdx < 0 {
				d.videoIdx = i
			}
		case C.AVMEDIA_TYPE_AUDIO:
			if d.audioIdx < 0 {
				d.audioIdx = i
			}
		}
	}

	if d.videoIdx >= 0 {
		if err := d.openVideoCodec(); err != nil {
			return fail(err)
		}
	}
	if d.audioIdx >= 0 {
		if err := d.openAudioCodec(); err != nil {
			return fail(err)
		}
	}

	d.avFrame = C.av_frame_alloc()
	d.avPkt = C.av_packet_alloc()
	if d.avFrame == nil || d.avPkt == nil {
		return fail(fmt.Errorf("av_frame_alloc/av_packet_alloc failed"))
	}

	d.buildMeta()
	return d, nil
}
// openVideoCodec finds and opens a decoder for the selected video stream
// and stores its context in d.vCodecCtx.
//
// AV1 streams try the "libdav1d" decoder first and fall back to "av1";
// every other codec uses FFmpeg's default decoder for its codec id.
// Hardware acceleration is not used: the get_format callback is set to
// get_sw_format and the hwaccel fields are cleared.
//
// A context allocated before a later failure stays in d.vCodecCtx so the
// caller can release it through Close.
func (d *Decoder) openVideoCodec() error {
	par := d.stream(d.videoIdx).codecpar

	// lookup resolves a decoder by its FFmpeg name.
	lookup := func(decoderName string) *C.AVCodec {
		cName := C.CString(decoderName)
		defer C.free(unsafe.Pointer(cName))
		return C.avcodec_find_decoder_by_name(cName)
	}

	var dec *C.AVCodec
	if par.codec_id == C.AV_CODEC_ID_AV1 {
		if dec = lookup("libdav1d"); dec == nil {
			dec = lookup("av1")
		}
	} else {
		dec = C.avcodec_find_decoder(par.codec_id)
	}
	if dec == nil {
		return fmt.Errorf("no video decoder for codec_id %d", par.codec_id)
	}

	d.vCodecCtx = C.avcodec_alloc_context3(dec)
	if d.vCodecCtx == nil {
		return fmt.Errorf("avcodec_alloc_context3 video failed")
	}

	rc := C.avcodec_parameters_to_context(d.vCodecCtx, par)
	if rc < 0 {
		return fmt.Errorf("avcodec_parameters_to_context video: %d", rc)
	}

	// Software decoding only.
	d.vCodecCtx.get_format = (*[0]byte)(unsafe.Pointer(C.get_sw_format))
	d.vCodecCtx.hwaccel = nil
	d.vCodecCtx.hwaccel_context = nil

	rc = C.avcodec_open2(d.vCodecCtx, dec, nil)
	if rc < 0 {
		return fmt.Errorf("avcodec_open2 video: %d", rc)
	}
	return nil
}
// openAudioCodec finds and opens a decoder for the selected audio stream
// (d.aCodecCtx) and sets up the sample converter (d.swrCtx).
//
// The converter keeps the decoder's channel layout and sample rate and
// changes only the sample format, producing planar float
// (AV_SAMPLE_FMT_FLTP) output.
//
// Objects allocated before a later failure stay in the Decoder so the
// caller can release them through Close.
func (d *Decoder) openAudioCodec() error {
	par := d.stream(d.audioIdx).codecpar

	dec := C.avcodec_find_decoder(par.codec_id)
	if dec == nil {
		return fmt.Errorf("no audio decoder for codec_id %d", par.codec_id)
	}

	if d.aCodecCtx = C.avcodec_alloc_context3(dec); d.aCodecCtx == nil {
		return fmt.Errorf("avcodec_alloc_context3 audio failed")
	}
	if rc := C.avcodec_parameters_to_context(d.aCodecCtx, par); rc < 0 {
		return fmt.Errorf("avcodec_parameters_to_context audio: %d", rc)
	}
	if rc := C.avcodec_open2(d.aCodecCtx, dec, nil); rc < 0 {
		return fmt.Errorf("avcodec_open2 audio: %d", rc)
	}

	if d.swrCtx = C.swr_alloc(); d.swrCtx == nil {
		return fmt.Errorf("swr_alloc failed")
	}

	// key converts an option name to a C string and remembers it so that
	// all of them are freed together when the function returns.
	var cNames []*C.char
	key := func(optName string) *C.char {
		c := C.CString(optName)
		cNames = append(cNames, c)
		return c
	}
	defer func() {
		for _, c := range cNames {
			C.free(unsafe.Pointer(c))
		}
	}()

	swr := unsafe.Pointer(d.swrCtx)
	layout := &d.aCodecCtx.ch_layout
	rate := C.int64_t(d.aCodecCtx.sample_rate)

	C.av_opt_set_chlayout(swr, key("in_chlayout"), layout, 0)
	C.av_opt_set_chlayout(swr, key("out_chlayout"), layout, 0)
	C.av_opt_set_int(swr, key("in_sample_rate"), rate, 0)
	C.av_opt_set_int(swr, key("out_sample_rate"), rate, 0)
	C.av_opt_set_int(swr, key("in_sample_fmt"), C.int64_t(d.aCodecCtx.sample_fmt), 0)
	C.av_opt_set_int(swr, key("out_sample_fmt"), C.int64_t(C.AV_SAMPLE_FMT_FLTP), 0)

	if rc := C.swr_init(d.swrCtx); rc < 0 {
		return fmt.Errorf("swr_init: %d", rc)
	}
	return nil
}
// stream returns the idx-th entry of the format context's stream array.
// idx is not range-checked; callers must pass a valid stream index.
func (d *Decoder) stream(idx C.int) *C.AVStream {
	first := unsafe.Pointer(d.fmtCtx.streams)
	// The array holds pointers, so each slot is one pointer wide.
	slot := unsafe.Add(first, uintptr(idx)*unsafe.Sizeof(uintptr(0)))
	return *(**C.AVStream)(slot)
}
// buildMeta fills d.meta from the opened format context and the selected
// streams. Video and Audio stay nil when no stream of that kind was
// selected.
func (d *Decoder) buildMeta() {
	// FFmpeg reports an unknown duration as AV_NOPTS_VALUE (the most
	// negative int64). Converting that sentinel would give a meaningless
	// value, so any non-positive duration is reported as zero.
	var total time.Duration
	if ticks := int64(d.fmtCtx.duration); ticks > 0 {
		// d.fmtCtx.duration is expressed in 1/AV_TIME_BASE seconds.
		total = time.Duration(
			float64(ticks) *
				float64(time.Second) /
				float64(C.AV_TIME_BASE),
		)
	}
	d.meta = Meta{Duration: total}

	if d.videoIdx >= 0 {
		s := d.stream(d.videoIdx)
		d.meta.Video = &VideoMeta{
			Width:     int(s.codecpar.width),
			Height:    int(s.codecpar.height),
			FPSNum:    int(s.avg_frame_rate.num),
			FPSDen:    int(s.avg_frame_rate.den),
			CodecName: C.GoString(C.avcodec_get_name(s.codecpar.codec_id)),
		}
	}
	if d.audioIdx >= 0 {
		s := d.stream(d.audioIdx)
		d.meta.Audio = &AudioMeta{
			SampleRate: int(s.codecpar.sample_rate),
			Channels:   int(s.codecpar.ch_layout.nb_channels),
			CodecName:  C.GoString(C.avcodec_get_name(s.codecpar.codec_id)),
		}
	}
}
// Meta returns the media information collected when the decoder was
// created: the overall duration and the descriptions of the selected
// video and audio streams.
//
// The result is a copy of the cached value; its Video and Audio pointers
// refer to the same objects on every call.
func (d *Decoder) Meta() Meta {
	info := d.meta
	return info
}
// DecodeFrame returns the next decoded audio or video frame.
//
// It reads packets until one of them yields a frame. Packets belonging
// to streams other than the selected video and audio streams are
// skipped. At end of input it returns (nil, nil).
//
// Both seekable and non-seekable inputs are supported; streaming inputs
// are consumed incrementally as data arrives.
//
// NOTE(review): the decoders are not drained when end of input is
// reached, so frames still buffered inside a decoder appear to be
// dropped — confirm whether that is intended.
func (d *Decoder) DecodeFrame() (*Frame, error) {
	for {
		rc := C.av_read_frame(d.fmtCtx, d.avPkt)
		switch {
		case rc == C.int(C.AVERROR_EOF):
			return nil, nil
		case rc < 0:
			return nil, fmt.Errorf("av_read_frame: %d", rc)
		}

		var (
			out    *Frame
			decErr error
		)
		if idx := C.int(d.avPkt.stream_index); idx == d.videoIdx {
			out, decErr = d.decodeVideo()
		} else if idx == d.audioIdx {
			out, decErr = d.decodeAudio()
		}

		// The packet is released whether or not decoding succeeded.
		C.av_packet_unref(d.avPkt)

		if decErr != nil {
			return nil, decErr
		}
		if out == nil {
			// Nothing produced by this packet; read the next one.
			continue
		}
		return out, nil
	}
}
// decodeVideo feeds the current packet (d.avPkt) to the video decoder and
// tries to fetch one decoded picture.
//
// It returns (nil, nil) when the decoder needs more input or has reached
// its end, and a Frame with Video set otherwise. Timestamps are converted
// with the video stream's time base.
//
// NOTE(review): only one frame is received per packet sent — confirm
// that the decoders in use never emit more than one.
func (d *Decoder) decodeVideo() (*Frame, error) {
	if rc := C.avcodec_send_packet(d.vCodecCtx, d.avPkt); rc < 0 {
		return nil, fmt.Errorf("avcodec_send_packet video: %d", rc)
	}

	rc := C.avcodec_receive_frame(d.vCodecCtx, d.avFrame)
	switch {
	case rc == -C.EAGAIN, rc == C.int(C.AVERROR_EOF):
		return nil, nil
	case rc < 0:
		return nil, fmt.Errorf("avcodec_receive_frame video: %d", rc)
	}
	// The pixels are copied out below, so the frame can be released on exit.
	defer C.av_frame_unref(d.avFrame)

	picture, err := frameToNRGBA(d.avFrame)
	if err != nil {
		return nil, err
	}

	timeBase := d.stream(d.videoIdx).time_base
	video := &VideoFrame{
		Img:      picture,
		PTS:      streamTimeToDuration(int64(d.avFrame.pts), timeBase.num, timeBase.den),
		Duration: streamTimeToDuration(int64(d.avFrame.duration), timeBase.num, timeBase.den),
	}
	return &Frame{Video: video}, nil
}
// decodeAudio feeds the current packet (d.avPkt) to the audio decoder,
// tries to fetch one decoded frame and converts it to interleaved
// float32 samples.
//
// It returns (nil, nil) when the decoder needs more input, has reached
// its end, or produced an empty frame; otherwise a Frame with Audio set.
// The timestamp is converted with the audio stream's time base.
//
// The converter in d.swrCtx is configured by openAudioCodec for planar
// float output (one plane per channel), so the planes are interleaved
// here. If that output format changes, this function must change too.
//
// NOTE(review): only one frame is received per packet sent — confirm
// that the decoders in use never emit more than one.
func (d *Decoder) decodeAudio() (*Frame, error) {
	ret := C.avcodec_send_packet(d.aCodecCtx, d.avPkt)
	if ret < 0 {
		return nil, fmt.Errorf("avcodec_send_packet audio: %d", ret)
	}
	ret = C.avcodec_receive_frame(d.aCodecCtx, d.avFrame)
	if ret == -C.EAGAIN || ret == C.int(C.AVERROR_EOF) {
		return nil, nil
	}
	if ret < 0 {
		return nil, fmt.Errorf("avcodec_receive_frame audio: %d", ret)
	}
	defer C.av_frame_unref(d.avFrame)

	nSamples := int(d.avFrame.nb_samples)
	nChannels := int(d.aCodecCtx.ch_layout.nb_channels)
	if nSamples <= 0 || nChannels <= 0 {
		return nil, nil
	}

	// swr_convert takes an array of plane pointers, not the sample buffer
	// itself. The planes live in C memory so that the pointer array handed
	// to C holds no Go pointers. cgo's C.malloc never returns nil.
	planeBytes := nSamples * int(unsafe.Sizeof(float32(0)))
	raw := C.malloc(C.size_t(planeBytes * nChannels))
	defer C.free(raw)

	planes := make([]*C.uint8_t, nChannels)
	for ch := range planes {
		planes[ch] = (*C.uint8_t)(unsafe.Add(raw, ch*planeBytes))
	}

	// extended_data is the frame's array of input planes.
	outSamples := C.swr_convert(d.swrCtx, &planes[0], C.int(nSamples),
		d.avFrame.extended_data, C.int(nSamples))
	if outSamples < 0 {
		return nil, fmt.Errorf("swr_convert: %d", outSamples)
	}

	// Interleave the planes: sample i of channel ch goes to i*nChannels+ch.
	got := int(outSamples)
	samples := make([]float32, got*nChannels)
	for ch := 0; ch < nChannels; ch++ {
		plane := unsafe.Slice((*float32)(unsafe.Add(raw, ch*planeBytes)), got)
		for i, v := range plane {
			samples[i*nChannels+ch] = v
		}
	}

	tb := d.stream(d.audioIdx).time_base
	pts := streamTimeToDuration(int64(d.avFrame.pts), tb.num, tb.den)
	return &Frame{Audio: &AudioFrame{
		Samples:    samples,
		Channels:   nChannels,
		SampleRate: int(d.aCodecCtx.sample_rate),
		PTS:        pts,
	}}, nil
}
// frameToNRGBA converts a decoded 8-bit 4:2:0 planar YUV frame
// (AV_PIX_FMT_YUV420P or AV_PIX_FMT_YUVJ420P) into a fully opaque NRGBA
// image of the same size.
//
// It returns an error for any other pixel format, for a non-positive
// frame size, and for frames whose plane pointers or strides cannot be
// read safely.
//
// NOTE(review): the conversion uses fixed coefficients (1.402, 0.344136,
// 0.714136, 1.772) without any range expansion; the frame's color_range
// and colorspace are ignored.
func frameToNRGBA(f *C.AVFrame) (*image.NRGBA, error) {
	if f.format != C.int(C.AV_PIX_FMT_YUV420P) && f.format != C.int(C.AV_PIX_FMT_YUVJ420P) {
		return nil, fmt.Errorf("unsupported pixel format: %d", int(f.format))
	}

	w, h := int(f.width), int(f.height)
	if w <= 0 || h <= 0 {
		return nil, fmt.Errorf("invalid frame size: %dx%d", w, h)
	}

	// Chroma planes are subsampled by two in both directions, rounded up
	// so odd widths and heights are covered.
	cw, ch := (w+1)/2, (h+1)/2

	yStride := int(f.linesize[0])
	uStride := int(f.linesize[1])
	vStride := int(f.linesize[2])
	if f.data[0] == nil || f.data[1] == nil || f.data[2] == nil ||
		yStride < w || uStride < cw || vStride < cw {
		return nil, fmt.Errorf("invalid frame layout: linesize %d/%d/%d for %dx%d",
			yStride, uStride, vStride, w, h)
	}

	// Expose exactly the bytes that are read: full strides for all rows
	// but the last, and only the visible width for the last row.
	yPlane := unsafe.Slice((*byte)(unsafe.Pointer(f.data[0])), yStride*(h-1)+w)
	uPlane := unsafe.Slice((*byte)(unsafe.Pointer(f.data[1])), uStride*(ch-1)+cw)
	vPlane := unsafe.Slice((*byte)(unsafe.Pointer(f.data[2])), vStride*(ch-1)+cw)

	img := image.NewNRGBA(image.Rect(0, 0, w, h))
	for row := range h {
		yRow := yPlane[row*yStride:][:w]
		uRow := uPlane[(row/2)*uStride:][:cw]
		// Each plane is indexed with its own stride.
		vRow := vPlane[(row/2)*vStride:][:cw]
		dst := img.Pix[row*img.Stride:][:4*w]

		for col, yb := range yRow {
			yv := float64(yb)
			uv := float64(uRow[col/2]) - 128
			vv := float64(vRow[col/2]) - 128

			px := dst[4*col:][:4]
			px[0] = clamp(yv + 1.402*vv)
			px[1] = clamp(yv - 0.344136*uv - 0.714136*vv)
			px[2] = clamp(yv + 1.772*uv)
			px[3] = 255
		}
	}
	return img, nil
}
// clamp converts v to a byte, saturating at 0 and 255.
// In-range values are truncated toward zero, not rounded.
func clamp(v float64) uint8 {
	switch {
	case v < 0:
		return 0
	case v > 255:
		return 255
	default:
		return uint8(v)
	}
}
// streamTimeToDuration converts value, expressed in units of num/den
// seconds (a stream time base), to a time.Duration.
//
// It returns 0 when den is zero or when value is FFmpeg's "no timestamp"
// sentinel (AV_NOPTS_VALUE), since neither can be converted meaningfully.
func streamTimeToDuration(value int64, num C.int, den C.int) time.Duration {
	// AV_NOPTS_VALUE is the most negative int64.
	const noPTS = -1 << 63

	if den == 0 || value == noPTS {
		return 0
	}
	seconds :=
		float64(value) *
			float64(num) /
			float64(den)
	return time.Duration(seconds * float64(time.Second))
}
// Close releases all FFmpeg resources owned by
// the decoder.
//
// It is safe to call Close multiple times, and on a decoder that was
// only partially initialised: every field is checked before it is freed
// and is cleared afterwards.
func (d *Decoder) Close() {
	if d.avFrame != nil {
		C.av_frame_free(&d.avFrame)
	}
	if d.avPkt != nil {
		C.av_packet_free(&d.avPkt)
	}
	if d.swrCtx != nil {
		C.swr_free(&d.swrCtx)
	}
	if d.vCodecCtx != nil {
		C.avcodec_free_context(&d.vCodecCtx)
	}
	if d.aCodecCtx != nil {
		C.avcodec_free_context(&d.aCodecCtx)
	}
	if d.fmtCtx != nil {
		// With custom I/O the AVIOContext is not released here; it is
		// freed separately below.
		C.avformat_close_input(&d.fmtCtx)
	}
	if d.avioCtx != nil {
		// avio_context_free does not free the I/O buffer. FFmpeg may have
		// replaced the buffer that was originally allocated, so free the
		// one the context currently holds.
		C.av_freep(unsafe.Pointer(&d.avioCtx.buffer))
		C.avio_context_free(&d.avioCtx)
	}
	// Delete the handle last: the I/O callbacks that use it can no longer
	// run once the AVIOContext is gone.
	if d.handle != 0 {
		d.handle.Delete()
		d.handle = 0
	}
}