package repository

import (
	"errors"
	"fmt"
	"log/slog"
	"regexp"
	"slices"
	"strconv"
	"strings"
	"time"

	"github.com/go-shiori/dom"
	"github.com/golang-module/carbon/v2"
	"github.com/jmoiron/sqlx"
	"github.com/spf13/viper"

	"git.amok.space/yevhen/resource-scraper/helper/parser"
	"git.amok.space/yevhen/resource-scraper/pkg/repository/table"
	"git.amok.space/yevhen/resource-scraper/types/constant"
	"git.amok.space/yevhen/resource-scraper/types/model"
)

// Patterns used while scraping prescene listings. They are compiled once at
// package scope instead of once per scraped post.
var (
	// presceneSkipHrefRe matches hrefs ending in "-<digits>/"; posts whose link
	// matches are skipped.
	// NOTE(review): presumably these are duplicate/suffixed slugs — confirm.
	presceneSkipHrefRe = regexp.MustCompile(`-\d+\/$`)

	// presceneReleaserRe captures the trailing "-<word>" of a post title, which
	// is stored as the releaser.
	presceneReleaserRe = regexp.MustCompile(`(?is)-(\w+)$`)

	// prescenePostIDRe captures the digits of a class attribute that starts
	// with "post-<digits> post".
	prescenePostIDRe = regexp.MustCompile(`(?s)^post-(\d+)\spost`)
)

// Prescene is the repository that scrapes prescene listings and stores the
// scraped posts through the wrapped database handle.
type Prescene struct {
	db *sqlx.DB
}

// NewPresceneRepository returns a Prescene repository bound to db.
func NewPresceneRepository(db *sqlx.DB) *Prescene {
	return &Prescene{db: db}
}

// GetPage scrapes listing pages and returns the entries produced for them.
//
// When the constant.FlagSingleUri setting is non-empty, only that URI
// (joined to the configured endpoint) is scraped and pageNumbers is ignored.
// Otherwise every element of pageNumbers is scraped in order: page "1" uses
// the configured endpoint as is, any other page appends the
// constant.CfgKeyEndpointNext format (formatted with the page number) to it.
// After each page the function sleeps for the configured
// constant.CfgKeySleepBeforeNextIteration duration.
//
// A page that fails to parse is logged and skipped, so the returned error is
// always nil and the returned slice is never nil.
func (s *Prescene) GetPage(pageNumbers []string) ([]model.ExternalSources, error) {
	entries := make([]model.ExternalSources, 0)
	endpoint := viper.GetString(constant.CfgKeyEndpoint)

	if uri := viper.GetString(constant.FlagSingleUri); uri != "" {
		target := fmt.Sprintf("%s/%s", strings.Trim(endpoint, "/"), strings.Trim(uri, "/"))
		result, err := parseUrl(target, s.db)
		if err != nil {
			// Report the failure the same way the paginated branch does
			// instead of discarding it silently.
			slog.Error("parsing url", "err", err)
			return entries, nil
		}
		return append(entries, result...), nil
	}

	for _, page := range pageNumbers {
		// Build the page URL from the unchanged base endpoint on every
		// iteration; appending to endpoint itself would stack the suffixes of
		// all previously visited pages onto later requests.
		pageURL := endpoint
		if page != "1" {
			pageURL += fmt.Sprintf(viper.GetString(constant.CfgKeyEndpointNext), page)
		}

		if result, err := parseUrl(pageURL, s.db); err == nil {
			entries = append(entries, result...)
		} else {
			slog.Error("parsing url", "err", err)
		}

		time.Sleep(viper.GetDuration(constant.CfgKeySleepBeforeNextIteration))
	}

	return entries, nil
}

// parseUrl loads the HTML document at endpoint, extracts every element
// matching ".post.type-post.category-flac.category-music" and hands each one
// to table.ExternalSources.InsertOnDuplicate, collecting the values it returns.
//
// A post is skipped when its title is "Auto Draft", when its link matches
// presceneSkipHrefRe, when its releaser is listed under the tag group equal to
// constant.TagIgnore, or when no post id can be read from its class attribute.
//
// It returns an error when the document cannot be fetched or is nil.
func parseUrl(endpoint string, db *sqlx.DB) ([]model.ExternalSources, error) {
	tags := viper.GetStringMapStringSlice("groups.tags")

	slog.Info("singleton", "url", endpoint)

	doc, err := parser.HTMLSourceFromURL(endpoint)
	if err != nil {
		return nil, fmt.Errorf("loading %q: %w", endpoint, err)
	}
	if doc == nil {
		return nil, errors.New("document is nil")
	}

	entries := make([]model.ExternalSources, 0)

	for i, item := range dom.QuerySelectorAll(doc, ".post.type-post.category-flac.category-music") {
		var es model.ExternalSources
		columns := []string{"`type`", "type_id", "title", "eXsource", "releaser", "created"}

		if title := dom.QuerySelector(item, ".title"); title != nil {
			if anchor := dom.QuerySelector(title, "h1 > a"); anchor != nil {
				es.Type = constant.ScopePrescene
				es.Title = dom.GetAttribute(anchor, "title")
				if es.Title == "Auto Draft" {
					slog.Info("Skipped", "title", es.Title)
					continue
				}

				es.ExSource = dom.GetAttribute(anchor, "href")
				if presceneSkipHrefRe.MatchString(es.ExSource) {
					continue
				}

				// FindStringSubmatch returns nil when the title has no
				// "-<word>" suffix; leave the releaser empty in that case
				// rather than indexing a nil slice.
				if match := presceneReleaserRe.FindStringSubmatch(es.Title); match != nil {
					es.Releaser = match[1]
				}

				// The first tag group containing the releaser decides both
				// flags. Map iteration order is unspecified, so a releaser
				// listed in several groups gets an arbitrary one of them.
				for flag, groups := range tags {
					if slices.Contains(groups, es.Releaser) {
						es.A = flag
						es.H = flag
						columns = append(columns, "a", "h")
						break
					}
				}
			}

			if es.A == constant.TagIgnore {
				slog.Info("Skipped", "releaser", es.Releaser)
				continue
			}

			if localtime := dom.QuerySelector(title, "small > span.localtime"); localtime != nil {
				es.Created = carbon.Parse(dom.GetAttribute(localtime, "data-lttime"))
			}
		}

		// The numeric post id comes from the element's class attribute. A post
		// without a readable id is skipped instead of panicking.
		cls := dom.GetAttribute(item, "class")
		idMatch := prescenePostIDRe.FindStringSubmatch(cls)
		if idMatch == nil {
			slog.Warn("Skipped", "reason", "no post id in class attribute", "class", cls)
			continue
		}
		typeID, convErr := strconv.Atoi(idMatch[1])
		if convErr != nil {
			slog.Warn("Skipped", "reason", "post id is not a valid integer", "class", cls, "err", convErr)
			continue
		}
		es.TypeId = typeID

		esModel := table.ExternalSources{Columns: columns}
		entry := esModel.InsertOnDuplicate(es, db)
		entries = append(entries, entry)

		fmt.Println("====================== ", i, " ==============================")
		fmt.Printf("%+v\n", entry)
	}

	return entries, nil
}