fin facebook crawler

This commit is contained in:
jay 2019-03-10 20:04:20 +08:00
parent 2c6ec585d0
commit 098a51671b
3 changed files with 177 additions and 20 deletions

View File

@ -4,6 +4,6 @@ COPY . .
RUN apk add --no-cache git make && \
go build -o crawler .
FROM geckodriver:latest
FROM docker.mtfos.xyz/mtfos/geckodriver-alpine:latest
COPY --from=builder /data/crawler /usr/bin/crawler
CMD ["/usr/bin/crawler"]

21
main.go
View File

@ -4,7 +4,7 @@ import (
"fmt"
"log"
"github.com/tebeka/selenium"
"git.trj.tw/golang/go-crawler/modules/facebook"
"git.trj.tw/golang/go-crawler/modules/browser"
)
@ -21,22 +21,5 @@ func main() {
log.Fatal(err)
}
defer func() { browser.StopService() }()
wd, err := browser.NewWD()
if err != nil {
log.Fatal(err)
}
defer wd.Quit()
err = wd.Get("https://tools.trj.tw")
if err != nil {
log.Fatal(err)
}
el, err := wd.FindElement(selenium.ByCSSSelector, "body")
if err != nil {
log.Fatal(err)
}
fmt.Println(el.Text())
fmt.Println(facebook.GetLastPost("MTFoS-404079100096306"))
}

View File

@ -0,0 +1,174 @@
package facebook
import (
"errors"
"fmt"
"net/url"
"regexp"
"sort"
"strconv"
"strings"
"git.trj.tw/golang/go-crawler/modules/browser"
"github.com/tebeka/selenium"
)
// fbURL is the scheme and host of the Facebook site. GetLastPost parses it as
// the base URL for page addresses and prepends it to the returned post link.
var fbURL = "https://www.facebook.com"
// PostInfo describes a single post scraped from a Facebook page.
type PostInfo struct {
	// Text is the visible text of the post body.
	Text string
	// ID is the identifier of the post.
	ID string
	// Link is the address of the post.
	Link string
	// Time is the post timestamp as an integer; posts are ordered by it.
	// NOTE(review): presumably Unix seconds (it is read from Facebook's
	// data-utime attribute) - confirm.
	Time int64
}
// ByPostInfo is a slice of PostInfo that provides the Len, Swap and Less
// methods needed by sort.Interface, ordering posts by ascending Time.
type ByPostInfo []PostInfo

// Len reports how many posts the slice holds.
func (p ByPostInfo) Len() int {
	count := len(p)
	return count
}

// Swap exchanges the posts stored at indexes i and j.
func (p ByPostInfo) Swap(i, j int) {
	tmp := p[i]
	p[i] = p[j]
	p[j] = tmp
}

// Less reports whether the post at index i has a smaller Time than the post
// at index j.
func (p ByPostInfo) Less(i, j int) bool {
	left, right := p[i].Time, p[j].Time
	return left < right
}
// Regular expressions used when inspecting post links. They are compiled once,
// when the package is loaded; MustCompile panics on an invalid pattern, which
// can only happen through a programming error in the literals below.
var (
	// idRegexps holds the link shapes a numeric post ID can be captured from.
	// In every pattern the ID is capture group 1.
	idRegexps = []*regexp.Regexp{
		// "?story_fbid=<id>" or "&story_fbid=<id>" in a query string. The
		// character class is [?&]; a "|" inside a class would be a literal
		// pipe, not an alternation.
		regexp.MustCompile(`[?&]story_fbid=(\d+)`),
		// ".../posts/<id>"
		regexp.MustCompile(`/posts/(\d+)`),
		// ".../photos/<anything>/<id>"
		regexp.MustCompile(`/photos/.+?/(\d+)`),
		// ".../videos/<id>"
		regexp.MustCompile(`/videos/(\d+)`),
	}

	// qsRegexp matches any string that contains "id" (for example "id" and
	// "story_fbid"). It is unanchored on purpose so that both keys match.
	qsRegexp = regexp.MustCompile("id")
)
// GetLastPost loads the posts listing of the Facebook page named by page in a
// new WebDriver session and returns the post with the greatest timestamp.
//
// Every "div.userContentWrapper" element on the loaded page is treated as a
// post. Elements that lack any of the expected parts (timestamp, link, content)
// are skipped rather than failing the whole call. The returned link has its
// query string reduced to the parameters whose key matches qsRegexp and is
// made absolute against fbURL.
//
// An error is returned when page is empty, when the page URL cannot be built,
// when the browser session cannot be created or fails to load or query the
// page, and when no post could be read from the page.
func GetLastPost(page string) (info *PostInfo, err error) {
	if len(page) == 0 {
		return nil, errors.New("page id is empty")
	}

	pageURL, err := url.Parse(fbURL)
	if err != nil {
		return nil, err
	}
	pageURL, err = pageURL.Parse(fmt.Sprintf("/%s/posts", page))
	if err != nil {
		return nil, err
	}

	wd, err := browser.NewWD()
	if err != nil {
		return nil, err
	}
	defer wd.Quit()

	if err = wd.Get(pageURL.String()); err != nil {
		return nil, err
	}

	elements, err := wd.FindElements(selenium.ByCSSSelector, "div.userContentWrapper")
	if err != nil {
		return nil, err
	}

	posts := make([]PostInfo, 0, len(elements))
	for _, el := range elements {
		post, perr := parsePostElement(el)
		if perr != nil {
			// Best effort: a wrapper that does not look like a complete post
			// is ignored so that one odd element cannot hide the others.
			continue
		}
		posts = append(posts, post)
	}
	if len(posts) == 0 {
		// Without this guard posts[0] below would panic.
		return nil, fmt.Errorf("no posts found on page %q", page)
	}

	// Newest first; the stable sort keeps page order for equal timestamps.
	sort.SliceStable(posts, func(i, j int) bool {
		return posts[i].Time > posts[j].Time
	})

	latest := posts[0]
	latest.Link = cleanPostLink(latest.Link)
	return &latest, nil
}

// parsePostElement reads the timestamp, link, text and ID of one post wrapper
// element. It returns an error as soon as any required part is missing.
func parsePostElement(el selenium.WebElement) (PostInfo, error) {
	// The timestamp lives in the data-utime attribute of the post's <abbr>.
	timeEl, err := el.FindElement(selenium.ByTagName, "abbr")
	if err != nil {
		return PostInfo{}, err
	}
	timeStr, err := timeEl.GetAttribute("data-utime")
	if err != nil {
		return PostInfo{}, err
	}
	timestamp, err := strconv.ParseInt(timeStr, 10, 64)
	if err != nil {
		return PostInfo{}, err
	}

	// The parent of the <abbr> carries the href of the post.
	linkEl, err := timeEl.FindElement(selenium.ByXPATH, "..")
	if err != nil {
		return PostInfo{}, err
	}
	link, err := linkEl.GetAttribute("href")
	if err != nil {
		return PostInfo{}, err
	}

	contentEl, err := el.FindElement(selenium.ByCSSSelector, "div.userContent")
	if err != nil {
		return PostInfo{}, err
	}
	text, err := contentEl.Text()
	if err != nil {
		return PostInfo{}, err
	}

	// Prefer the id attribute of the content's first child; when it is absent
	// or empty, fall back to the numeric ID embedded in the link.
	childEl, err := contentEl.FindElement(selenium.ByXPATH, "./*")
	if err != nil {
		return PostInfo{}, err
	}
	id, err := childEl.GetAttribute("id")
	if err != nil || len(id) == 0 {
		id = postIDFromLink(link)
	}

	return PostInfo{Text: text, ID: id, Link: link, Time: timestamp}, nil
}

// postIDFromLink returns capture group 1 of the first pattern in idRegexps
// that matches link, or "" when none matches.
func postIDFromLink(link string) string {
	for _, re := range idRegexps {
		if m := re.FindStringSubmatch(link); len(m) > 1 {
			return m[1]
		}
	}
	return ""
}

// cleanPostLink keeps only the query parameters whose key matches qsRegexp
// and returns the link as an absolute URL on fbURL.
func cleanPostLink(link string) string {
	parts := strings.SplitN(link, "?", 2)
	if len(parts) == 2 {
		kept := make([]string, 0)
		for _, param := range strings.Split(parts[1], "&") {
			// Limit 2 separates the key from the value; a limit of 1 would
			// return the whole "key=value" pair and test the value as well.
			key := strings.SplitN(param, "=", 2)[0]
			if qsRegexp.MatchString(key) {
				kept = append(kept, param)
			}
		}
		link = parts[0]
		if len(kept) > 0 {
			link += "?" + strings.Join(kept, "&")
		}
	}

	// An href that is already absolute must not get the host prepended again.
	if strings.HasPrefix(link, "http://") || strings.HasPrefix(link, "https://") {
		return link
	}
	return fbURL + link
}