Classifier enhancements (#158)

2025-12-28 06:34:17 +00:00 · 2024-02-21 17:02:03 +00:00 · 2024-02-21 17:02:03 +00:00 · ab0405196e
commit ab0405196e
parent ce5d913bd8
16 changed files with 148 additions and 273 deletions
--- a/graphql/schema/enums.graphqls
+++ b/graphql/schema/enums.graphqls
@ -3,6 +3,7 @@ enum ContentType {
  tv_show
  music
  ebook
+  comic
  audiobook
  game
  software
--- a/internal/classifier/classification.go
+++ b/internal/classifier/classification.go
@ -47,6 +47,14 @@ func (a *ContentAttributes) ApplyHint(h model.TorrentHint) {
 	}
 }

+// InferVideoAttributes fills in the video-related fields of the receiver
+// (resolution, source, codec, release group, 3D flag and modifier) using the
+// values the model package infers from input. Any values previously held by
+// those fields are overwritten.
+func (a *ContentAttributes) InferVideoAttributes(input string) {
+	resolution := model.InferVideoResolution(input)
+	source := model.InferVideoSource(input)
+	// The codec and the release group come from a single combined inference.
+	codec, group := model.InferVideoCodecAndReleaseGroup(input)
+	video3d := model.InferVideo3d(input)
+	modifier := model.InferVideoModifier(input)
+
+	a.VideoResolution = resolution
+	a.VideoSource = source
+	a.VideoCodec = codec
+	a.ReleaseGroup = group
+	a.Video3d = video3d
+	a.VideoModifier = modifier
+}
+
 func (c *Classification) ApplyHint(h model.TorrentHint) {
 	c.ContentType = h.NullContentType()
 	c.ContentAttributes.ApplyHint(h)
--- a/internal/classifier/fallback.go
+++ b/internal/classifier/fallback.go
@ -19,5 +19,9 @@ func (c FallbackClassifier) Priority() int {
 func (c FallbackClassifier) Classify(_ context.Context, t model.Torrent) (Classification, error) {
 	cl := Classification{}
 	cl.ApplyHint(t.Hint)
+	hasVideo := t.HasFileType(model.FileTypeVideo)
+	if hasVideo.Valid && hasVideo.Bool {
+		cl.InferVideoAttributes(t.Name)
+	}
 	return cl, nil
 }
--- a/internal/classifier/keywords/classifier.go
+++ b/internal/classifier/keywords/classifier.go
@ -4,22 +4,14 @@ import (
 	"context"
 	"github.com/bitmagnet-io/bitmagnet/internal/classifier"
 	"github.com/bitmagnet-io/bitmagnet/internal/model"
-	"github.com/bitmagnet-io/bitmagnet/internal/regex"
 	"regexp"
 )

-func NewKeywordsClassifier(contentType model.ContentType, words []string, priority int) classifier.SubClassifier {
-	return keywordsClassifier{
-		contentType: contentType,
-		priority:    priority,
-		regex:       regex.NewRegexFromNames(words...),
-	}
-}
-
+// keywordsClassifier is a sub-classifier that assigns contentType to torrents
+// whose name matches regex.
 type keywordsClassifier struct {
-	contentType model.ContentType
-	priority    int
-	regex       *regexp.Regexp
+	contentType       model.ContentType
+	priority          int
+	regex             *regexp.Regexp
+	// requiredFileTypes, when non-empty, is passed to Torrent.HasFileType;
+	// a valid negative result causes Classify to return ErrNoMatch.
+	requiredFileTypes []model.FileType
 }

 func (c keywordsClassifier) Key() string {
@ -30,14 +22,25 @@ func (c keywordsClassifier) Priority() int {
 	return c.priority
 }

-func (c keywordsClassifier) Classify(_ context.Context, torrent model.Torrent) (classifier.Classification, error) {
-	if !c.regex.MatchString(torrent.Name) {
+// Classify returns a classification carrying the classifier's content type
+// when the torrent has no hint, its name matches the keyword regex, and the
+// required file types (if any) are not definitively absent. Otherwise it
+// returns classifier.ErrNoMatch. Video attributes are inferred from the name
+// when the torrent is reported to contain video files.
+func (c keywordsClassifier) Classify(_ context.Context, t model.Torrent) (classifier.Classification, error) {
+	if !t.Hint.IsNil() || !c.regex.MatchString(t.Name) {
 		return classifier.Classification{}, classifier.ErrNoMatch
 	}
-	return classifier.Classification{
+	if len(c.requiredFileTypes) > 0 {
+		// Reject only on a valid negative answer; an invalid (unknown)
+		// result is allowed through.
+		if required := t.HasFileType(c.requiredFileTypes...); required.Valid && !required.Bool {
+			return classifier.Classification{}, classifier.ErrNoMatch
+		}
+	}
+	cl := classifier.Classification{
 		ContentType: model.NullContentType{
 			Valid:       true,
 			ContentType: c.contentType,
 		},
-	}, nil
+	}
+	// Infer video attributes only on a valid positive answer for video files.
+	if video := t.HasFileType(model.FileTypeVideo); video.Valid && video.Bool {
+		cl.InferVideoAttributes(t.Name)
+	}
+	return cl, nil
 }
--- a/internal/classifier/keywords/factory.go
+++ b/internal/classifier/keywords/factory.go
@ -4,22 +4,50 @@ import (
 	"github.com/bitmagnet-io/bitmagnet/internal/boilerplate/lazy"
 	"github.com/bitmagnet-io/bitmagnet/internal/classifier"
 	"github.com/bitmagnet-io/bitmagnet/internal/model"
+	"github.com/bitmagnet-io/bitmagnet/internal/regex"
 	"go.uber.org/fx"
 )

+// Result is the fx output struct for this package. Each field is a lazily
+// constructed sub-classifier tagged for the "content_classifiers" group.
 type Result struct {
 	fx.Out
-	Xxx lazy.Lazy[classifier.SubClassifier] `group:"content_classifiers"`
+	Music     lazy.Lazy[classifier.SubClassifier] `group:"content_classifiers"`
+	Audiobook lazy.Lazy[classifier.SubClassifier] `group:"content_classifiers"`
+	Ebook     lazy.Lazy[classifier.SubClassifier] `group:"content_classifiers"`
+	Xxx       lazy.Lazy[classifier.SubClassifier] `group:"content_classifiers"`
 }

 func New() Result {
 	return Result{
+		Audiobook: lazy.New(func() (classifier.SubClassifier, error) {
+			return keywordsClassifier{
+				contentType:       model.ContentTypeAudiobook,
+				regex:             regex.NewRegexFromNames(audiobookWords...),
+				priority:          20,
+				requiredFileTypes: []model.FileType{model.FileTypeAudio},
+			}, nil
+		}),
+		Music: lazy.New(func() (classifier.SubClassifier, error) {
+			return keywordsClassifier{
+				contentType:       model.ContentTypeMusic,
+				regex:             regex.NewRegexFromNames(musicWords...),
+				priority:          21,
+				requiredFileTypes: []model.FileType{model.FileTypeAudio},
+			}, nil
+		}),
+		Ebook: lazy.New(func() (classifier.SubClassifier, error) {
+			return keywordsClassifier{
+				contentType:       model.ContentTypeAudiobook,
+				regex:             regex.NewRegexFromNames(ebookWords...),
+				priority:          22,
+				requiredFileTypes: []model.FileType{model.FileTypeDocument},
+			}, nil
+		}),
 		Xxx: lazy.New(func() (classifier.SubClassifier, error) {
-			return NewKeywordsClassifier(
-				model.ContentTypeXxx,
-				xxxWords,
-				20,
-			), nil
+			return keywordsClassifier{
+				contentType: model.ContentTypeXxx,
+				regex:       regex.NewRegexFromNames(xxxWords...),
+				priority:    23,
+			}, nil
 		}),
 	}
 }
--- a/internal/classifier/keywords/words.go
+++ b/internal/classifier/keywords/words.go
@ -1,7 +1,53 @@
 package keywords

+// musicWords lists the name keywords used to build the music classifier's
+// regex. Entries are unique.
+var musicWords = []string{
+	"discography",
+	"music",
+	"album",
+	"va",
+	"various",
+	"compilation",
+	"ep",
+	"lp",
+	"single",
+	"vinyl",
+	"classical",
+	"disco",
+	"folk",
+	"hits",
+	"house",
+	"indie",
+	"jazz",
+	"metal",
+	"pop",
+	"reggae",
+	"rock",
+	"trance",
+}
+
+// audiobookWords lists the name keywords used to build the audiobook
+// classifier's regex. "book", "books", "abridged" and "unabridged" also
+// appear in ebookWords.
+var audiobookWords = []string{
+	"audiobook",
+	"audiobooks",
+	"book",
+	"books",
+	"abridged",
+	"unabridged",
+	"narrated",
+}
+
+// ebookWords lists the name keywords used to build the ebook classifier's
+// regex. "book", "books", "abridged" and "unabridged" also appear in
+// audiobookWords.
+var ebookWords = []string{
+	"book",
+	"books",
+	"ebook",
+	"ebooks",
+	"abridged",
+	"unabridged",
+}
+
+// xxxWords lists the name keywords used to build the xxx classifier's regex.
 var xxxWords = []string{
 	"xxx",
 	"porn",
 	"porno",
+	"sex",
 }
--- a/internal/classifier/video/parse.go
+++ b/internal/classifier/video/parse.go
@ -59,7 +59,7 @@ var titleEpisodesRegex = rex.New(
 	),
 ).MustCompile()

+// multiRegex matches the keywords ("multi", "dual") whose presence sets
+// LanguageMulti on the parsed content attributes.
-var multiRegex = regex.NewRegexFromNames("multi")
+var multiRegex = regex.NewRegexFromNames("multi", "dual")

 var separatorToken = rex.Chars.Runes(" ._")

@ -168,16 +168,11 @@ func ParseContent(hintCt model.NullContentType, input string) (model.ContentType
 	if ct != model.ContentTypeTvShow {
 		episodes = nil
 	}
-	vc, rg := model.InferVideoCodecAndReleaseGroup(rest)
-	return ct, title, year, classifier.ContentAttributes{
-		Episodes:        episodes,
-		Languages:       model.InferLanguages(rest),
-		LanguageMulti:   multiRegex.MatchString(rest),
-		VideoResolution: model.InferVideoResolution(rest),
-		VideoSource:     model.InferVideoSource(rest),
-		VideoCodec:      vc,
-		Video3d:         model.InferVideo3d(rest),
-		VideoModifier:   model.InferVideoModifier(rest),
-		ReleaseGroup:    rg,
-	}, nil
+	attrs := classifier.ContentAttributes{
+		Episodes:      episodes,
+		Languages:     model.InferLanguages(rest),
+		LanguageMulti: multiRegex.MatchString(rest),
+	}
+	attrs.InferVideoAttributes(rest)
+	return ct, title, year, attrs, nil
 }
--- a/internal/gql/gql.gen.go
+++ b/internal/gql/gql.gen.go
@ -1569,6 +1569,7 @@ var sources = []*ast.Source{
  tv_show
  music
  ebook
+  comic
  audiobook
  game
  software
--- a/internal/model/content_type.go
+++ b/internal/model/content_type.go
@ -1,7 +1,7 @@
 package model

 // ContentType represents the type of content
-// ENUM(movie, tv_show, music, ebook, audiobook, game, software, xxx)
+// ENUM(movie, tv_show, music, ebook, comic, audiobook, game, software, xxx)
 type ContentType string

 func (c ContentType) Label() string {
@ -16,24 +16,28 @@ func (c ContentType) IsVideo() bool {
 	return c == ContentTypeMovie || c == ContentTypeTvShow || c == ContentTypeXxx
 }

+// A map of file extensions to associated content types.
+// The map includes only extensions where the content type can be very reliably inferred.
 var extensionToContentTypeMap = map[string]ContentType{
 	"m4b":     ContentTypeAudiobook,
+	"cb7":     ContentTypeComic,
+	"cba":     ContentTypeComic,
+	"cbr":     ContentTypeComic,
+	"cbt":     ContentTypeComic,
+	"cbz":     ContentTypeComic,
 	"epub":    ContentTypeEbook,
 	"mobi":    ContentTypeEbook,
 	"azw":     ContentTypeEbook,
 	"azw3":    ContentTypeEbook,
 	"pdf":     ContentTypeEbook,
-	"cbr":     ContentTypeEbook,
-	"cbz":     ContentTypeEbook,
-	"cb7":     ContentTypeEbook,
-	"cbt":     ContentTypeEbook,
-	"cba":     ContentTypeEbook,
 	"chm":     ContentTypeEbook,
 	"doc":     ContentTypeEbook,
 	"docx":    ContentTypeEbook,
 	"odt":     ContentTypeEbook,
 	"rtf":     ContentTypeEbook,
 	"djvu":    ContentTypeEbook,
+	"ape":     ContentTypeMusic,
+	"flac":    ContentTypeMusic,
 	"exe":     ContentTypeSoftware,
 	"dmg":     ContentTypeSoftware,
 	"app":     ContentTypeSoftware,
--- a/internal/model/content_type_enum.go
+++ b/internal/model/content_type_enum.go
@ -20,6 +20,7 @@ const (
 	ContentTypeTvShow    ContentType = "tv_show"
 	ContentTypeMusic     ContentType = "music"
 	ContentTypeEbook     ContentType = "ebook"
+	ContentTypeComic     ContentType = "comic"
 	ContentTypeAudiobook ContentType = "audiobook"
 	ContentTypeGame      ContentType = "game"
 	ContentTypeSoftware  ContentType = "software"
@ -33,6 +34,7 @@ var _ContentTypeNames = []string{
 	string(ContentTypeTvShow),
 	string(ContentTypeMusic),
 	string(ContentTypeEbook),
+	string(ContentTypeComic),
 	string(ContentTypeAudiobook),
 	string(ContentTypeGame),
 	string(ContentTypeSoftware),
@ -53,6 +55,7 @@ func ContentTypeValues() []ContentType {
 		ContentTypeTvShow,
 		ContentTypeMusic,
 		ContentTypeEbook,
+		ContentTypeComic,
 		ContentTypeAudiobook,
 		ContentTypeGame,
 		ContentTypeSoftware,
@ -77,6 +80,7 @@ var _ContentTypeValue = map[string]ContentType{
 	"tv_show":   ContentTypeTvShow,
 	"music":     ContentTypeMusic,
 	"ebook":     ContentTypeEbook,
+	"comic":     ContentTypeComic,
 	"audiobook": ContentTypeAudiobook,
 	"game":      ContentTypeGame,
 	"software":  ContentTypeSoftware,
--- a/webui/dist/bitmagnet/index.html
+++ b/webui/dist/bitmagnet/index.html
--- a/webui/dist/bitmagnet/main.568765649ad6b863.js
+++ b/webui/dist/bitmagnet/main.568765649ad6b863.js
--- a/webui/src/app/app.module.ts
+++ b/webui/src/app/app.module.ts
@ -34,7 +34,10 @@ import { AppErrorsService } from "./app-errors.service";
 })
 export class AppModule {
  constructor(iconRegistry: MatIconRegistry, domSanitizer: DomSanitizer) {
-    iconRegistry.setDefaultFontSetClass("material-icons-outlined");
+    iconRegistry.setDefaultFontSetClass(
+      "material-icons-outlined",
+      "material-symbols-outlined",
+    );
    iconRegistry.addSvgIcon(
      "magnet",
      domSanitizer.bypassSecurityTrustResourceUrl("assets/magnet.svg"),
--- a/webui/src/app/graphql/generated/index.ts
+++ b/webui/src/app/graphql/generated/index.ts
@ -71,6 +71,7 @@ export type ContentCollection = {

 export type ContentType =
  | 'audiobook'
+  | 'comic'
  | 'ebook'
  | 'game'
  | 'movie'
--- a/webui/src/app/search/torrent-content/torrent-content-search.engine.ts
+++ b/webui/src/app/search/torrent-content/torrent-content-search.engine.ts
@ -292,6 +292,11 @@ const contentTypes: Record<generated.ContentType | "null", ContentTypeInfo> = {
    plural: "E-Books",
    icon: "auto_stories",
  },
+  comic: {
+    singular: "Comic",
+    plural: "Comics",
+    icon: "comic_bubble",
+  },
  audiobook: {
    singular: "Audiobook",
    plural: "Audiobooks",
--- a/webui/src/index.html
+++ b/webui/src/index.html
@ -12,7 +12,7 @@
      rel="stylesheet"
    />
    <link
-      href="https://fonts.googleapis.com/icon?family=Material+Icons|Material+Icons+Outlined"
+      href="https://fonts.googleapis.com/icon?family=Material+Icons|Material+Icons+Outlined|Material+Symbols+Outlined"
      rel="stylesheet"
    />
  </head>