resource-scraper/pkg/repository/prescene.go

135 lines
3.5 KiB
Go

package repository
import (
"errors"
"fmt"
"log/slog"
"regexp"
"slices"
"strconv"
"strings"
"time"
"github.com/go-shiori/dom"
"github.com/golang-module/carbon/v2"
"github.com/jmoiron/sqlx"
"github.com/spf13/viper"
"git.amok.space/yevhen/resource-scraper/helper/parser"
"git.amok.space/yevhen/resource-scraper/pkg/repository/table"
"git.amok.space/yevhen/resource-scraper/types/constant"
"git.amok.space/yevhen/resource-scraper/types/model"
)
// Prescene is the repository for entries scraped from the Prescene source.
type Prescene struct {
	// db is the database handle passed on to the page parser when entries are
	// stored.
	db *sqlx.DB
}
// NewPresceneRepository returns a Prescene repository that uses db as its
// database handle.
func NewPresceneRepository(db *sqlx.DB) *Prescene {
	repo := new(Prescene)
	repo.db = db
	return repo
}
// GetPage scrapes Prescene listing pages and returns the entries produced by
// parseUrl.
//
// When the single-URI setting (constant.FlagSingleUri) is non-empty, only that
// URI, joined onto the configured endpoint, is scraped and pageNumbers is
// ignored. Otherwise every element of pageNumbers is scraped in order: page "1"
// maps to the bare endpoint, and any other page appends the
// constant.CfgKeyEndpointNext format string (formatted with the page number)
// to the endpoint.
//
// Scrape failures are logged and the affected page is skipped, so the returned
// error is always nil.
func (s *Prescene) GetPage(pageNumbers []string) ([]model.ExternalSources, error) {
	entries := make([]model.ExternalSources, 0)
	endpoint := viper.GetString(constant.CfgKeyEndpoint)

	if uri := viper.GetString(constant.FlagSingleUri); uri != "" {
		url := fmt.Sprintf("%s/%s", strings.Trim(endpoint, "/"), strings.Trim(uri, "/"))
		result, err := parseUrl(url, s.db)
		if err != nil {
			// Report the failure the same way the paging branch does instead
			// of dropping it silently.
			slog.Error("parsing url", "err", err)
			return entries, nil
		}
		return append(entries, result...), nil
	}

	for _, page := range pageNumbers {
		// Derive every page URL from the unmodified base endpoint. Appending
		// to endpoint itself would carry the suffix of earlier pages over
		// into later requests.
		pageURL := endpoint
		if page != "1" {
			pageURL += fmt.Sprintf(viper.GetString(constant.CfgKeyEndpointNext), page)
		}

		result, err := parseUrl(pageURL, s.db)
		if err != nil {
			slog.Error("parsing url", "err", err)
		} else {
			entries = append(entries, result...)
		}

		// Pause between requests for the configured duration.
		time.Sleep(viper.GetDuration(constant.CfgKeySleepBeforeNextIteration))
	}

	return entries, nil
}
// parseUrl fetches the HTML document at endpoint, walks every element matching
// ".post.type-post.category-flac.category-music" and stores each usable post
// through table.ExternalSources.InsertOnDuplicate.
//
// A post is skipped (and never stored) when:
//   - it has no ".title" element or no "h1 > a" anchor inside it,
//   - its title is "Auto Draft",
//   - its link ends in "-<digits>/",
//   - its releaser belongs to the group tagged constant.TagIgnore,
//   - no numeric post ID can be read from its class attribute.
//
// It returns the values handed back by InsertOnDuplicate, in document order.
// An error is returned only when the document cannot be fetched or is nil.
func parseUrl(endpoint string, db *sqlx.DB) ([]model.ExternalSources, error) {
	tags := viper.GetStringMapStringSlice("groups.tags")

	slog.Info("singleton", "url", endpoint)
	doc, err := parser.HTMLSourceFromURL(endpoint)
	if err != nil {
		return nil, fmt.Errorf("fetching %q: %w", endpoint, err)
	}
	if doc == nil {
		return nil, errors.New("document is nil")
	}

	// Compile the patterns once per page instead of once per post.
	var (
		// Links ending in "-<digits>/" are skipped.
		// NOTE(review): presumably these are duplicate-slug posts; confirm.
		numberedSlug = regexp.MustCompile(`-\d+\/$`)
		// The releaser is the last "-word" of the title.
		releaserSuffix = regexp.MustCompile(`(?is)-(\w+)$`)
		// The class attribute is expected to start with "post-<id> post".
		postClassID = regexp.MustCompile(`(?s)^post-(\d+)\spost`)
	)

	entries := make([]model.ExternalSources, 0)
	for i, item := range dom.QuerySelectorAll(doc, ".post.type-post.category-flac.category-music") {
		title := dom.QuerySelector(item, ".title")
		if title == nil {
			// Without a title block none of the scraped fields can be filled;
			// storing the post would only write empty values.
			slog.Warn("Skipped", "reason", "title element not found", "index", i)
			continue
		}
		anchor := dom.QuerySelector(title, "h1 > a")
		if anchor == nil {
			slog.Warn("Skipped", "reason", "title anchor not found", "index", i)
			continue
		}

		var es model.ExternalSources
		columns := []string{"`type`", "type_id", "title", "eXsource", "releaser", "created"}

		es.Type = constant.ScopePrescene
		es.Title = dom.GetAttribute(anchor, "title")
		if es.Title == "Auto Draft" {
			slog.Info("Skipped", "title", es.Title)
			continue
		}

		es.ExSource = dom.GetAttribute(anchor, "href")
		if numberedSlug.MatchString(es.ExSource) {
			continue
		}

		// FindStringSubmatch returns nil when the title has no "-releaser"
		// suffix; indexing it unchecked would panic.
		if m := releaserSuffix.FindStringSubmatch(es.Title); m != nil {
			es.Releaser = m[1]
			for flag, groups := range tags {
				if slices.Contains(groups, es.Releaser) {
					es.A = flag
					es.H = flag
					columns = append(columns, "a", "h")
					break
				}
			}
		} else {
			slog.Warn("releaser not found in title", "title", es.Title)
		}

		if es.A == constant.TagIgnore {
			slog.Info("Skipped", "releaser", es.Releaser)
			continue
		}

		if localtime := dom.QuerySelector(title, "small > span.localtime"); localtime != nil {
			es.Created = carbon.Parse(dom.GetAttribute(localtime, "data-lttime"))
		}

		// The post ID is embedded in the element's class attribute.
		cls := dom.GetAttribute(item, "class")
		m := postClassID.FindStringSubmatch(cls)
		if m == nil {
			slog.Warn("Skipped", "reason", "post id not found in class", "class", cls, "title", es.Title)
			continue
		}
		if es.TypeId, err = strconv.Atoi(m[1]); err != nil {
			slog.Warn("Skipped", "reason", "post id is not a valid integer", "class", cls, "err", err)
			continue
		}

		esModel := table.ExternalSources{Columns: columns}
		entry := esModel.InsertOnDuplicate(es, db)
		entries = append(entries, entry)

		fmt.Println("====================== ", i, " ==============================")
		fmt.Printf("%+v\n", entry)
	}

	return entries, nil
}