Browse Source

finish crawler in net/utils

master
realxlfd 3 months ago
parent
commit
61396d3eaf
  1. 2
      README.MD
  2. 2
      net/apis/skysnow/utils.go
  3. 76
      net/apis/ym/utils.go
  4. 28
      net/utils/crawler/driver.go
  5. 416
      net/utils/crawler/parser.go

2
README.MD

@ -18,11 +18,13 @@
- `apis` 一些网络应用程序API的Go绑定
- `git.realxlfd.cc/RealXLFD/net/apis/tmdb` The Movie Database
API的Go绑定,内置了Web爬虫
- `git.realxlfd.cc/RealXLFD/net/apis/ym` 云码API的Go绑定
- `git.realxlfd.cc/RealXLFD/net/apis/openai` OpenAI API的Go绑定
- `git.realxlfd.cc/RealXLFD/net/apis/skysnow` 天雪站爬虫 (开发中)
- `utils` 网络通用工具
- `git.realxlfd.cc/RealXLFD/net/utils/cookie`
用于反序列化Set-Cookie标头或者序列化Cookie
- `git.realxlfd.cc/RealXLFD/net/utils/crawler` 基于Chrome的Web爬虫⭐
- `git.realxlfd.cc/RealXLFD/net/utils/urlbuilder` URL构建器
- `utils` 一般通用工具

2
net/apis/skysnow/utils.go

@ -153,7 +153,7 @@ func NewClient(account Account, opts ...Option) (*Client, []error) {
if time.Since(start) > timeout {
return nil, []error{errors.New("login timed out")}
}
time.Sleep(500 * time.Millisecond) // 简短暂停再次尝试,以避免过于频繁的查询
time.Sleep(500 * time.Millisecond)
}
var cookies []selenium.Cookie
cookies, err = driver.GetCookies()

76
net/apis/ym/utils.go

@ -0,0 +1,76 @@
package ym
import (
"encoding/json"
"errors"
"fmt"
"github.com/parnurzeal/gorequest"
"net/http"
"net/url"
)
// Token is the account token sent with every verification request.
// CommonVerify refuses to run while it is empty.
var Token = ``

// BaseURL is the endpoint CommonVerify posts to.
var BaseURL = &url.URL{
	Scheme: `http`,
	Host:   `api.jfbym.com`,
	Path:   `/api/YmServer/customApi`,
}

// Proxy is the HTTP proxy CommonVerify routes requests through.
// Set it to nil to connect directly.
var Proxy = &url.URL{
	Scheme: `http`,
	Host:   `127.0.0.1:7890`,
}
// RespCommonVerify mirrors the JSON body that CommonVerify decodes from the
// verification endpoint.
type RespCommonVerify struct {
// Msg is the server's message; CommonVerify reports it when Code is not 10000.
Msg string `json:"msg"`
// Code is the top-level status; CommonVerify treats 10000 as success.
Code int `json:"code"`
// Data is the nested payload object. Only Data.Data is read by CommonVerify;
// the remaining fields are decoded but not otherwise used in this file.
Data struct {
Code int `json:"code"`
// Data is the recognized text that CommonVerify returns as its result.
Data string `json:"data"`
Time float64 `json:"time"`
// Externel keeps the spelling of the JSON key it is bound to.
Externel int `json:"externel"`
UniqueCode string `json:"unique_code"`
FilePath string `json:"file_path"`
} `json:"data"`
}
// CommonVerify submits a base64-encoded image to the endpoint in BaseURL and
// returns the text the service recognized.
//
// imgBase64 is the image payload and method is the numeric recognition type;
// they are sent as the `image` and `type` form fields together with Token.
// When Proxy is non-nil the request is routed through it.
//
// A non-empty errs is returned when Token is empty, the request fails, the
// status is not 200, the body is not valid JSON, or the response code is not
// 10000. On error result is empty.
func CommonVerify(imgBase64 string, method int) (result string, errs []error) {
	if Token == `` {
		return ``, []error{errors.New(`token is empty`)}
	}

	// Build the body with url.Values so every value is percent-encoded.
	// Base64 routinely contains '+', '/' and '=', and an unescaped '+' is
	// decoded as a space in application/x-www-form-urlencoded data, which
	// corrupts the image.
	form := url.Values{}
	form.Set(`image`, imgBase64)
	form.Set(`type`, fmt.Sprintf(`%d`, method))
	form.Set(`token`, Token)

	req := gorequest.New().Post(BaseURL.String()).Send(form.Encode()).Set(
		`Content-Type`, `application/x-www-form-urlencoded`,
	)
	if Proxy != nil {
		req.Proxy(Proxy.String())
	}

	var resp *http.Response
	var body string
	resp, body, errs = req.End()
	if len(errs) > 0 {
		return ``, errs
	}
	if resp.StatusCode != http.StatusOK {
		return ``, []error{
			fmt.Errorf(
				`server error: status code %d`, resp.StatusCode,
			),
		}
	}

	v := &RespCommonVerify{}
	if err := json.Unmarshal([]byte(body), v); err != nil {
		// Keep the historical message but attach the decode error so the
		// real cause is not lost.
		return ``, []error{
			fmt.Errorf(`server can not parse image data: %w`, err),
		}
	}
	if v.Code != 10000 {
		return ``, []error{fmt.Errorf(`server error: %s`, v.Msg)}
	}
	return v.Data.Data, nil
}

28
net/utils/crawler/driver.go

@ -59,6 +59,11 @@ func (c *Crawler) CreateInstance(url string) (
"--headless",
"--no-sandbox",
"--disable-gpu",
"--disable-web-security",
}
} else {
chromeCaps.Args = []string{
"--disable-web-security",
}
}
caps.AddChrome(chromeCaps)
@ -75,3 +80,26 @@ func (c *Crawler) CreateInstance(url string) (
}
return driver, nil
}
// GetBlankInstance opens a new Chrome session against the WebDriver endpoint
// at http://localhost:<port>/wd/hub and returns it without navigating
// anywhere.
//
// When the crawler is configured as headless, Chrome is started with
// --headless, --no-sandbox and --disable-gpu; otherwise no extra arguments
// are passed.
func (c *Crawler) GetBlankInstance() (
	driver selenium.WebDriver, err error,
) {
	var chromeCaps chrome.Capabilities
	if c.headless {
		chromeCaps.Args = []string{"--headless", "--no-sandbox", "--disable-gpu"}
	}

	caps := selenium.Capabilities{"browserName": "chrome"}
	caps.AddChrome(chromeCaps)

	endpoint := fmt.Sprintf("http://localhost:%d/wd/hub", c.port)
	if driver, err = selenium.NewRemote(caps, endpoint); err != nil {
		return nil, err
	}
	return driver, nil
}

416
net/utils/crawler/parser.go

@ -0,0 +1,416 @@
package crawler
import (
"errors"
"fmt"
"git.realxlfd.cc/RealXLFD/golib/net/apis/ym"
"git.realxlfd.cc/RealXLFD/golib/net/utils/cookie"
"github.com/tebeka/selenium"
"math"
"net/url"
"regexp"
"strconv"
"strings"
"time"
)
// Mode is a compiled crawl script: an ordered list of steps bound to one
// WebDriver session, plus the state those steps share while running.
type Mode struct {
// driver is the WebDriver session the steps operate on.
driver selenium.WebDriver
// executor holds the steps in script order; Exec runs them sequentially.
executor []func() error
// Ctx stores named string values written by steps (attribute values,
// cookies, window handles, page source, captcha results).
Ctx map[string]string
// Selected holds the elements found by the most recent SELECT step.
Selected []selenium.WebElement
}
// Exec runs the compiled steps in order and stops at the first one that
// fails, returning its error. It returns nil when every step succeeds.
func (m *Mode) Exec() error {
	for i := range m.executor {
		err := m.executor[i]()
		if err == nil {
			continue
		}
		return err
	}
	return nil
}
// re matches one brace-delimited argument, capturing the (shortest possible)
// text between `{` and `}`.
var re = regexp.MustCompile(`\{(.*?)\}`)
const (
// scriptPre is a JavaScript template used by the VERIFY step. Its single %s
// is replaced with a CSS selector; the script draws the matched image onto
// a canvas and returns the canvas content as a data URL.
// NOTE(review): the selector is placed inside a double-quoted JS string
// without escaping, so a selector containing `"` will break the script.
scriptPre = `var canvas = document.createElement('canvas');
var img = document.querySelector("%s")
var ctx = canvas.getContext('2d');
canvas.width = img.width;
canvas.height = img.height;
ctx.drawImage(img, 0, 0);
return canvas.toDataURL();`
)
// Parse compiles a crawl script into a Mode bound to drive.
//
// process holds one operation per line. Both LF and CRLF line endings are
// accepted, surrounding whitespace on each line is ignored, and blank lines
// are skipped. Nothing is executed here; call Mode.Exec to run the script.
//
// The first line that cannot be parsed aborts compilation; the returned
// error wraps the parser error and carries the 1-based line number.
func Parse(process string, drive selenium.WebDriver) (*Mode, error) {
	var actions []Action
	// Split on "\n" and trim, instead of splitting on "\r\n" only: a script
	// written with Unix line endings would otherwise be treated as one line.
	for i, raw := range strings.Split(process, "\n") {
		line := strings.TrimSpace(raw)
		if line == "" {
			continue
		}
		action, err := parseOperation(line)
		if err != nil {
			return nil, fmt.Errorf("line %d: %w", i+1, err)
		}
		actions = append(actions, action)
	}
	return toMode(actions, drive), nil
}
// toMode compiles parsed actions into a Mode whose steps run against driver.
//
// Each action becomes one closure in the Mode's executor list, in the same
// order as actions. Nothing touches the browser until Mode.Exec is called.
// Steps communicate through the Mode: SELECT fills Mode.Selected, and steps
// that produce a value store it in Mode.Ctx under the name carried by the
// action.
//
// toMode panics if an action carries an Operation it does not handle.
func toMode(actions []Action, driver selenium.WebDriver) *Mode {
	mode := &Mode{
		driver: driver,
		Ctx:    make(map[string]string),
	}
	executor := make([]func() error, 0, len(actions))
	for _, action := range actions {
		// Pin a per-iteration copy. Before Go 1.22 the loop variable is
		// shared, so every closure below would otherwise see the last action.
		action := action
		switch action.Act {
		case TO:
			if action.Dst == "" {
				// No URL: switch to the window handle saved under Save.
				executor = append(executor, func() error {
					window, ok := mode.Ctx[action.Save]
					if !ok {
						return fmt.Errorf(
							`not found the window: %s`, action.Save,
						)
					}
					return driver.SwitchWindow(window)
				})
			} else {
				executor = append(executor, func() error {
					return driver.Get(action.Dst)
				})
			}
		case SELECT:
			executor = append(executor, func() error {
				var err error
				mode.Selected, err = driver.FindElements(
					selenium.ByCSSSelector, action.Dst,
				)
				return err
			})
		case GET:
			executor = append(executor, func() error {
				// Check the length, not nil-ness: an empty non-nil result
				// would make Selected[0] panic.
				if len(mode.Selected) == 0 {
					return errors.New("no selected element")
				}
				first := mode.Selected[0]
				var err error
				if strings.EqualFold(action.Dst, "text") {
					mode.Ctx[action.Save], err = first.Text()
					return err
				}
				mode.Ctx[action.Save], err = first.GetAttribute(action.Dst)
				return err
			})
		case CLICK:
			executor = append(executor, func() error {
				if len(mode.Selected) == 0 {
					return errors.New("no selected element")
				}
				for _, e := range mode.Selected {
					if err := e.Click(); err != nil {
						return err
					}
				}
				return nil
			})
		case FILL:
			executor = append(executor, func() error {
				if len(mode.Selected) == 0 {
					return errors.New("no selected element")
				}
				// A non-empty Save means "type the value stored in Ctx";
				// resolve it at run time without mutating the action.
				text := action.Dst
				if action.Save != "" {
					text = mode.Ctx[action.Save]
				}
				for _, e := range mode.Selected {
					if err := e.SendKeys(text); err != nil {
						return err
					}
				}
				return nil
			})
		case COOKIE:
			executor = append(executor, func() error {
				cookies, err := driver.GetCookies()
				if err != nil {
					return err
				}
				var rawCookie strings.Builder
				for _, c := range cookies {
					rawCookie.WriteString(c.Name)
					rawCookie.WriteString("=")
					rawCookie.WriteString(c.Value)
					rawCookie.WriteString("; ")
				}
				mode.Ctx[action.Dst] = rawCookie.String()
				return nil
			})
		case WAIT:
			executor = append(executor, func() error {
				start := time.Now()
				seconds, err := strconv.Atoi(action.Save)
				if err != nil {
					return err
				}
				timeout := time.Duration(seconds) * time.Second
				// With an empty selector this is a plain sleep for the whole
				// timeout; otherwise it returns as soon as the element
				// exists. Running out of time is not reported as an error.
				for time.Since(start) <= timeout {
					// Short pause between polls to avoid hammering the driver.
					time.Sleep(200 * time.Millisecond)
					if action.Dst == "" {
						continue
					}
					if _, err := driver.FindElement(
						selenium.ByCSSSelector, action.Dst,
					); err == nil {
						return nil
					}
				}
				return nil
			})
		case HTML:
			executor = append(executor, func() error {
				var err error
				mode.Ctx[action.Save], err = driver.PageSource()
				return err
			})
		case SETCOOKIE:
			executor = append(executor, func() error {
				if err := driver.DeleteAllCookies(); err != nil {
					return err
				}
				cookies := cookie.New(action.Dst)
				current, err := driver.CurrentURL()
				if err != nil {
					return err
				}
				u, err := url.Parse(current)
				if err != nil {
					// Report the failure instead of panicking in library code.
					return err
				}
				for k, v := range cookies {
					if err := driver.AddCookie(
						&selenium.Cookie{
							Name: k, Value: v,
							Domain: u.Host, Expiry: math.MaxUint32,
							Path: u.Path, Secure: false,
						},
					); err != nil {
						return err
					}
				}
				return driver.Refresh()
			})
		case VERIFY:
			executor = append(executor, func() error {
				// The selector is interpolated into the script verbatim.
				script := fmt.Sprintf(scriptPre, action.Dst)
				data, err := driver.ExecuteScript(script, nil)
				if err != nil {
					return err
				}
				// Require a non-empty string. The previous condition
				// (ok || imgBase64 == "") was always true and forwarded
				// empty or non-string results to the solver.
				imgBase64, ok := data.(string)
				if !ok || imgBase64 == "" {
					return errors.New("can not parse image data")
				}
				result, errs := ym.CommonVerify(imgBase64, ModeNL8)
				if len(errs) > 0 {
					return errs[0]
				}
				mode.Ctx[action.Save] = result
				return nil
			})
		case WINDOW:
			executor = append(executor, func() error {
				var err error
				mode.Ctx[action.Dst], err = driver.CurrentWindowHandle()
				return err
			})
		default:
			// Unknown Operation values can only come from a programming
			// error in the parser, so this stays a panic.
			panic("unhandled default case")
		}
	}
	mode.executor = executor
	return mode
}
// parseOperation turns one script line into an Action.
//
// The first space-separated word selects the operation, case-insensitively.
// Recognized forms:
//
//	SETCOOKIE {cookies}       Dst = cookies
//	SELECT {css}...           Dst = all brace groups concatenated
//	FILL {text}               Dst = text
//	FILL .name                Save = name
//	WAIT {css} {seconds}      Dst = css (may be empty), Save = seconds
//	VERIFY {css} -> name      Dst = css, Save = name
//	CLICK
//	TO url                    Dst = url
//	TO .name                  Save = name
//	HTML name                 Save = name
//	WINDOW -> name            Dst = name
//	COOKIE -> name            Dst = name
//	GET attr -> name          Dst = attr, Save = name
//
// Any other line yields a zero Action and a non-nil error.
func parseOperation(line string) (actions Action, err error) {
	// strings.Split with a non-empty separator always returns at least one
	// element, so parts[0] is safe even for an empty line.
	parts := strings.Split(line, " ")
	mark := strings.ToUpper(parts[0])

	// Operations with brace-delimited arguments. Those arguments may contain
	// spaces, so they are matched against the whole line, not against parts.
	// Errors quote the whole line: indexing parts[1] here used to panic on
	// lines made of the keyword alone.
	switch mark {
	case "SETCOOKIE":
		matches := re.FindAllStringSubmatch(line, -1)
		if len(matches) != 1 || matches[0][1] == "" {
			return Action{}, fmt.Errorf("invalid argument: %s", line)
		}
		return Action{Act: SETCOOKIE, Dst: matches[0][1]}, nil
	case "SELECT":
		var cssSelector strings.Builder
		for _, match := range re.FindAllStringSubmatch(line, -1) {
			cssSelector.WriteString(match[1])
		}
		return Action{Act: SELECT, Dst: cssSelector.String()}, nil
	case "FILL":
		matches := re.FindAllStringSubmatch(line, -1)
		if len(matches) == 0 {
			// No braces: expect ".name", a reference to a stored value.
			after := strings.TrimPrefix(line, parts[0]+" ")
			if strings.HasPrefix(after, ".") {
				return Action{Act: FILL, Dst: "", Save: after[1:]}, nil
			}
			return Action{}, fmt.Errorf("invalid argument: %s", line)
		}
		if len(matches) != 1 || matches[0][1] == "" {
			return Action{}, fmt.Errorf("invalid argument: %s", line)
		}
		return Action{Act: FILL, Dst: matches[0][1]}, nil
	case "WAIT":
		matches := re.FindAllStringSubmatch(line, -1)
		if len(matches) != 2 || matches[1][1] == "" {
			return Action{}, fmt.Errorf("invalid argument: %s", line)
		}
		return Action{Act: WAIT, Dst: matches[0][1], Save: matches[1][1]}, nil
	case "VERIFY":
		matches := re.FindAllStringSubmatch(line, -1)
		if len(matches) != 1 || matches[0][1] == "" {
			return Action{}, fmt.Errorf("invalid argument: %s", line)
		}
		// What follows "VERIFY {css} " must be exactly "-> name".
		prefix := fmt.Sprintf(`%s %s `, parts[0], matches[0][0])
		after := strings.TrimPrefix(line, prefix)
		afters := strings.Split(after, " ")
		if len(afters) != 2 || afters[0] != "->" {
			return Action{}, fmt.Errorf("invalid argument: %s", after)
		}
		return Action{Act: VERIFY, Dst: matches[0][1], Save: afters[1]}, nil
	}

	// Remaining operations are plain space-separated words; the word count
	// is part of the syntax.
	switch {
	case len(parts) == 1 && mark == "CLICK":
		return Action{Act: CLICK}, nil
	case len(parts) == 2 && mark == "TO":
		if strings.HasPrefix(parts[1], ".") {
			return Action{
				Act: TO, Dst: "", Save: strings.TrimPrefix(parts[1], "."),
			}, nil
		}
		return Action{Act: TO, Dst: parts[1]}, nil
	case len(parts) == 2 && mark == "HTML":
		// Must be HTML, not GET: the HTML step stores the page source,
		// whereas GET would read an attribute named "" from an element.
		return Action{Act: HTML, Save: parts[1]}, nil
	case len(parts) == 3 && mark == "WINDOW" && parts[1] == "->":
		return Action{Act: WINDOW, Dst: parts[2]}, nil
	case len(parts) == 3 && mark == "COOKIE" && parts[1] == "->":
		return Action{Act: COOKIE, Dst: parts[2]}, nil
	case len(parts) == 4 && mark == "GET" && parts[2] == "->":
		return Action{Act: GET, Dst: parts[1], Save: parts[3]}, nil
	}

	if len(parts) > 4 {
		return Action{}, fmt.Errorf("can not parse line: %s", line)
	}
	return Action{}, fmt.Errorf("unknown operation: %s", line)
}
// Operation codes understood by toMode. The values come from iota, so the
// order of this list must not change.
const (
// TO loads the URL in Action.Dst, or, when Dst is empty, switches to the
// window handle stored in Ctx under Action.Save.
TO Operation = iota
// SELECT finds all elements matching the CSS selector in Action.Dst and
// stores them in Mode.Selected.
SELECT
// GET reads the attribute named by Action.Dst (or the element text when Dst
// is "text", case-insensitively) from the first selected element into
// Ctx[Action.Save].
GET
// CLICK clicks every selected element.
CLICK
// FILL sends keys to every selected element: Action.Dst, or the Ctx value
// named by Action.Save when Save is non-empty.
FILL
// COOKIE stores the session cookies as "name=value; " pairs in
// Ctx[Action.Dst].
COOKIE
// WAIT polls for the CSS selector in Action.Dst for up to Action.Save
// seconds; with an empty selector it simply waits.
WAIT
// HTML stores the page source in Ctx[Action.Save].
HTML
// SETCOOKIE deletes all cookies, adds those parsed from Action.Dst for the
// current URL's host and path, then refreshes the page.
SETCOOKIE
// VERIFY sends the image matched by the CSS selector in Action.Dst to
// ym.CommonVerify and stores the result in Ctx[Action.Save].
VERIFY
// WINDOW stores the current window handle in Ctx[Action.Dst].
WINDOW
)
// Recognition type codes for ym.CommonVerify, which sends its method
// argument as the `type` form field. The VERIFY step uses ModeNL8.
// NOTE(review): what each code selects on the service side is not visible in
// this file; confirm against the provider's documentation.
const (
ModeNLP6 = 10104
ModeNL8 = 10111
)
// VerifyMode is an integer type that nothing in the visible code references.
// NOTE(review): presumably intended as the type of the Mode* constants, which
// are currently untyped; confirm before relying on it.
type VerifyMode int
// Action is one parsed script line: an operation plus up to two string
// arguments whose meaning depends on the operation.
type Action struct {
// Act selects which step toMode builds for this action.
Act Operation
// Dst is the primary argument, e.g. a URL, CSS selector, attribute name,
// literal text, or the Ctx key to write for COOKIE and WINDOW.
Dst string
// Save is the secondary argument, usually the Ctx key to write or read;
// for WAIT it holds the timeout in seconds as a decimal string.
Save string
}
// Operation identifies the kind of step an Action represents; its values are
// the TO … WINDOW constants.
type Operation int
Loading…
Cancel
Save