package youtube import ( "context" "encoding/json" "errors" "fmt" "strconv" "strings" ) var ( ErrTranscriptDisabled = errors.New("transcript is disabled on this video") ) // TranscriptSegment is a single transcipt segment spanning a few milliseconds. type TranscriptSegment struct { // Text is the transcipt text. Text string `json:"text"` // StartMs is the start timestamp in ms. StartMs int `json:"offset"` // OffsetText e.g. '4:00'. OffsetText string `json:"offsetText"` // Duration the transcript segment spans in ms. Duration int `json:"duration"` } func (tr TranscriptSegment) String() string { return tr.OffsetText + " - " + strings.TrimSpace(tr.Text) } type VideoTranscript []TranscriptSegment func (vt VideoTranscript) String() string { var str string for _, tr := range vt { str += tr.String() + "\n" } return str } // GetTranscript fetches the video transcript if available. // // Not all videos have transcripts, only relatively new videos. // If transcripts are disabled or not available, ErrTranscriptDisabled is returned. func (c *Client) GetTranscript(video *Video, lang string) (VideoTranscript, error) { return c.GetTranscriptCtx(context.Background(), video, lang) } // GetTranscriptCtx fetches the video transcript if available. // // Not all videos have transcripts, only relatively new videos. // If transcripts are disabled or not available, ErrTranscriptDisabled is returned. func (c *Client) GetTranscriptCtx(ctx context.Context, video *Video, lang string) (VideoTranscript, error) { c.assureClient() if video == nil || video.ID == "" { return nil, fmt.Errorf("no video provided") } body, err := c.transcriptDataByInnertube(ctx, video.ID, lang) if err != nil { return nil, err } transcript, err := parseTranscript(body) if err != nil { return nil, err } return transcript, nil } func parseTranscript(body []byte) (VideoTranscript, error) { var resp transcriptResp if err := json.Unmarshal(body, &resp); err != nil { return nil, err } if len(resp.Actions) > 0 { // Android client response if app := resp.Actions[0].AppSegment; app != nil { return getSegments(app) } // Web client response if web := resp.Actions[0].WebSegment; web != nil { return nil, fmt.Errorf("not implemented") } } return nil, ErrTranscriptDisabled } type segmenter interface { ParseSegments() []TranscriptSegment } func getSegments(f segmenter) (VideoTranscript, error) { if segments := f.ParseSegments(); len(segments) > 0 { return segments, nil } return nil, ErrTranscriptDisabled } // transcriptResp is the JSON structure as returned by the transcript API. type transcriptResp struct { Actions []struct { AppSegment *appData `json:"elementsCommand"` WebSegment *webData `json:"updateEngagementPanelAction"` } `json:"actions"` } type appData struct { TEC struct { Args struct { ListArgs struct { Ow struct { InitialSeg []struct { TranscriptSegment struct { StartMs string `json:"startMs"` EndMs string `json:"endMs"` Text struct { String struct { // Content is the actual transctipt text Content string `json:"content"` } `json:"elementsAttributedString"` } `json:"snippet"` StartTimeText struct { String struct { // Content is the fomratted timestamp, e.g. '4:00' Content string `json:"content"` } `json:"elementsAttributedString"` } `json:"startTimeText"` } `json:"transcriptSegmentRenderer"` } `json:"initialSegments"` } `json:"overwrite"` } `json:"transformTranscriptSegmentListArguments"` } `json:"arguments"` } `json:"transformEntityCommand"` } func (s *appData) ParseSegments() []TranscriptSegment { rawSegments := s.TEC.Args.ListArgs.Ow.InitialSeg segments := make([]TranscriptSegment, 0, len(rawSegments)) for _, segment := range rawSegments { startMs, _ := strconv.Atoi(segment.TranscriptSegment.StartMs) endMs, _ := strconv.Atoi(segment.TranscriptSegment.EndMs) segments = append(segments, TranscriptSegment{ Text: segment.TranscriptSegment.Text.String.Content, StartMs: startMs, OffsetText: segment.TranscriptSegment.StartTimeText.String.Content, Duration: endMs - startMs, }) } return segments } type webData struct { Content struct { TR struct { Body struct { TBR struct { Cues []struct { Transcript struct { FormattedStartOffset struct { SimpleText string `json:"simpleText"` } `json:"formattedStartOffset"` Cues []struct { TranscriptCueRenderer struct { Cue struct { SimpleText string `json:"simpleText"` } `json:"cue"` StartOffsetMs string `json:"startOffsetMs"` DurationMs string `json:"durationMs"` } `json:"transcriptCueRenderer"` } `json:"cues"` } `json:"transcriptCueGroupRenderer"` } `json:"cueGroups"` } `json:"transcriptSearchPanelRenderer"` } `json:"content"` } `json:"transcriptRenderer"` } `json:"content"` } func (s *webData) ParseSegments() []TranscriptSegment { // TODO: doesn't actually work now, check json. cues := s.Content.TR.Body.TBR.Cues segments := make([]TranscriptSegment, 0, len(cues)) for _, s := range cues { formatted := s.Transcript.FormattedStartOffset.SimpleText segment := s.Transcript.Cues[0].TranscriptCueRenderer start, _ := strconv.Atoi(segment.StartOffsetMs) duration, _ := strconv.Atoi(segment.DurationMs) segments = append(segments, TranscriptSegment{ Text: segment.Cue.SimpleText, StartMs: start, OffsetText: formatted, Duration: duration, }) } return segments }