Browse Source

finish skysnow webcrawler

master
realxlfd 3 months ago
parent
commit
f7babd970f
  1. 78
      net/apis/skysnow/sub.go
  2. 77
      net/utils/crawler/driver.go

78
net/apis/skysnow/sub.go

@ -0,0 +1,78 @@
package skysnow
import (
"bytes"
"errors"
"fmt"
"git.realxlfd.cc/RealXLFD/golib/net/utils/urlbuilder"
"github.com/PuerkitoBio/goquery"
"github.com/parnurzeal/gorequest"
"strings"
)
type (
	// SubResult is a single entry extracted from the search result listing.
	SubResult struct {
		// Title is the entry's title; FindSubs fills it from the matched
		// anchor's `title` attribute.
		Title string
		// Link is the entry's link; FindSubs fills it from the matched
		// anchor's `href` attribute, without further processing.
		Link string
	}

	// SubResults is an ordered list of SubResult values.
	SubResults []SubResult
)
// FindSubs queries the site's torrent listing endpoint for keyword and returns
// the title and link of every matching entry.
//
// The request goes to /torrents_list_ajax.php under BaseURL with the query
// "cat=16&mod=torrents&search=<keyword>", carries the client's cookies and the
// AgentChrome user agent, and is routed through Proxy when one is configured.
// The response body is treated as an HTML fragment: it is wrapped in a minimal
// document and every `.embedded.torrent_title > a:first-child` anchor yields
// one SubResult.
//
// On failure the returned SubResults is nil and the error slice is non-nil.
// This covers URL construction errors, transport errors reported by the HTTP
// client, a non-200 status, and HTML parsing errors.
func (c *Client) FindSubs(keyword string) (SubResults, []error) {
	// Build the query URL.
	// NOTE(review): keyword is interpolated into the query string unescaped;
	// confirm that urlbuilder encodes it, otherwise characters such as '&' or
	// '#' in the keyword will corrupt the query.
	target, err := urlbuilder.New(BaseURL.String()).AddPath(
		`/torrents_list_ajax.php`,
	).AddQueries(
		fmt.Sprintf("cat=16&mod=torrents&search=%s", keyword),
	).Get()
	if err != nil {
		// Report the failure through the error result instead of panicking.
		return nil, []error{fmt.Errorf("building search url: %w", err)}
	}

	// Send the HTTP request.
	req := gorequest.New().Get(target.String()).Set(`Cookie`, c.Cookies).Set(
		`User-Agent`, AgentChrome,
	)
	if Proxy != nil {
		req.Proxy(Proxy.String())
	}
	resp, body, errs := req.End()
	if errs != nil {
		return nil, errs
	}
	if resp.StatusCode != 200 {
		return nil, []error{errors.New(`server error`)}
	}

	// Parse the response. The body is wrapped in <html><body> so the parser
	// receives a complete document.
	htmlBody := fmt.Sprintf(`<html><body>%s</body></html>`, body)
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader([]byte(htmlBody)))
	if err != nil {
		// Without this check a nil document would be dereferenced below.
		return nil, []error{fmt.Errorf("parsing search results: %w", err)}
	}

	var results SubResults
	doc.Find(`.embedded.torrent_title > a:first-child`).Each(
		func(_ int, s *goquery.Selection) {
			// Missing attributes are tolerated and leave the field empty.
			title, _ := s.Attr(`title`)
			link, _ := s.Attr(`href`)
			results = append(results, SubResult{Title: title, Link: link})
		},
	)
	return results, nil
}
// Filter returns the entries whose Title contains studio as a substring.
// Matching is case-sensitive, the input order is preserved, and the result is
// nil when no entry matches.
func (s SubResults) Filter(studio string) SubResults {
	var matched SubResults
	for i := range s {
		if !strings.Contains(s[i].Title, studio) {
			continue
		}
		matched = append(matched, s[i])
	}
	return matched
}
// Download is a placeholder: the download logic has not been written yet.
//
// It always returns a non-nil error so that callers cannot mistake the
// unimplemented stub for a completed download.
func (s SubResult) Download() error {
	// TODO: implement the download.
	return errors.New(`download is not implemented`)
}

77
net/utils/crawler/driver.go

@ -0,0 +1,77 @@
package crawler
import (
"fmt"
"github.com/tebeka/selenium"
"github.com/tebeka/selenium/chrome"
)
// State records whether a Crawler's driver service is considered running.
type State int

const (
	// Stopped is the zero value; Stop sets it after shutting the service down.
	Stopped State = iota
	// Runing is set by Init once the service has been started. The spelling
	// is kept as-is because the identifier is exported.
	Runing
)
// Crawler owns a ChromeDriver service and the settings used to open browser
// sessions against it.
type Crawler struct {
	// Service is the driver service started by Init and shut down by Stop.
	Service *selenium.Service
	// count is initialised to zero by Init.
	// NOTE(review): nothing in this file reads or updates it — confirm its
	// intended use.
	count int
	// headless makes CreateInstance launch Chrome with headless arguments.
	headless bool
	// port is the local port the service was started on; CreateInstance uses
	// it to build the remote endpoint URL.
	port int
	// state tracks whether the service is running, so Stop acts only once.
	state State
}
// Init starts a ChromeDriver service and returns a Crawler bound to it.
//
// chromeDriverPath is the ChromeDriver executable passed to
// selenium.NewChromeDriverService, Port is the port the service is started
// on, and headless is stored for later use by CreateInstance. The returned
// Crawler is in the Runing state. If the service cannot be started, the error
// is returned unchanged together with a nil Crawler.
func Init(chromeDriverPath string, Port int, headless bool) (*Crawler, error) {
	service, err := selenium.NewChromeDriverService(
		chromeDriverPath,
		Port,
		// Also hand the ChromeDriver path over as a service option.
		selenium.ChromeDriver(chromeDriverPath),
		// A nil writer is supplied for the service's log output.
		// NOTE(review): the original comment said this logs to os.Stderr —
		// confirm against the selenium.Output documentation.
		selenium.Output(nil),
	)
	if err != nil {
		return nil, err
	}
	// count is left at its zero value.
	return &Crawler{
		Service:  service,
		headless: headless,
		port:     Port,
		state:    Runing,
	}, nil
}
// Stop shuts down the driver service and marks the crawler as Stopped.
// Calling it on a crawler that is not in the Runing state does nothing.
func (c *Crawler) Stop() {
	if c.state == Runing {
		// Whatever Service.Stop returns is ignored; Stop has no result of its
		// own, and the state is switched to Stopped regardless.
		c.Service.Stop()
		c.state = Stopped
	}
}
// CreateInstance opens a new Chrome session through the crawler's driver
// service and navigates it to url.
//
// The session is requested from http://localhost:<port>/wd/hub, where port is
// the one given to Init. When the crawler was created with headless set,
// Chrome is started with --headless, --no-sandbox and --disable-gpu.
//
// On success the caller owns the returned driver and is responsible for
// ending the session. On failure the driver is nil; if the session had
// already been created when navigation failed, it is quit before returning so
// that it is not leaked.
func (c *Crawler) CreateInstance(url string) (
	driver selenium.WebDriver, err error,
) {
	caps := selenium.Capabilities{"browserName": "chrome"}
	chromeCaps := chrome.Capabilities{}
	if c.headless {
		chromeCaps.Args = []string{
			"--headless",
			"--no-sandbox",
			"--disable-gpu",
		}
	}
	caps.AddChrome(chromeCaps)

	driver, err = selenium.NewRemote(
		caps,
		fmt.Sprintf("http://localhost:%d/wd/hub", c.port),
	)
	if err != nil {
		return nil, err
	}

	if err = driver.Get(url); err != nil {
		// The caller never receives this driver, so end the session here.
		// Quit's own error is dropped on purpose: the navigation failure is
		// the one worth reporting.
		_ = driver.Quit()
		return nil, err
	}
	return driver, nil
}
Loading…
Cancel
Save