175 lines
3.3 KiB
Go
175 lines
3.3 KiB
Go
package facebook
|
|
|
|
import (
|
|
"errors"
|
|
"fmt"
|
|
"net/url"
|
|
"regexp"
|
|
"sort"
|
|
"strconv"
|
|
"strings"
|
|
|
|
"git.trj.tw/golang/go-crawler/modules/browser"
|
|
"github.com/tebeka/selenium"
|
|
)
|
|
|
|
// fbURL is the scheme and host of the Facebook site. GetLastPost parses it as
// the base URL of the page it loads and prepends it to the link of the post it
// returns.
var fbURL = "https://www.facebook.com"
|
|
|
|
// PostInfo describes a single post scraped from a Facebook page.
//
// Text is the visible text of the post body, ID is the identifier of the
// post, and Link is the URL of the post. Time is the integer read from the
// post's data-utime attribute (presumably Unix seconds; confirm against the
// page markup).
type PostInfo struct {
	Text, ID, Link string
	Time           int64
}
|
|
|
|
// ByPostInfo -
|
|
type ByPostInfo []PostInfo
|
|
|
|
// Len -
|
|
func (p ByPostInfo) Len() int { return len(p) }
|
|
|
|
// Swap -
|
|
func (p ByPostInfo) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
|
|
|
|
// Less -
|
|
func (p ByPostInfo) Less(i, j int) bool { return p[i].Time < p[j].Time }
|
|
|
|
var (
	// idRegexps holds the patterns tried, in order, to extract a numeric post
	// ID from a post link. Each pattern captures the ID in its first group.
	// They are compiled once at package initialisation.
	idRegexps = []*regexp.Regexp{
		// story_fbid query parameter, introduced by either '?' or '&'.
		regexp.MustCompile(`[?&]story_fbid=(\d+)`),
		// .../posts/<id>
		regexp.MustCompile(`/posts/(\d+)`),
		// .../photos/<anything>/<id>
		regexp.MustCompile(`/photos/.+?/(\d+)`),
		// .../videos/<id>
		regexp.MustCompile(`/videos/(\d+)`),
	}

	// qsRegexp selects the query-string parameters that are kept when a post
	// link is cleaned up. It is unanchored, so it matches any text that
	// contains "id" (for example "id" and "story_fbid").
	qsRegexp = regexp.MustCompile("id")
)
|
|
|
|
// GetLastPost -
|
|
func GetLastPost(page string) (info *PostInfo, err error) {
|
|
if len(page) == 0 {
|
|
return nil, errors.New("page id is empty")
|
|
}
|
|
|
|
pageURL, err := url.Parse(fbURL)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
pageURL, err = pageURL.Parse(fmt.Sprintf("/%s/posts", page))
|
|
|
|
wd, err := browser.NewWD()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer wd.Quit()
|
|
|
|
err = wd.Get(pageURL.String())
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
elements, err := wd.FindElements(selenium.ByCSSSelector, "div.userContentWrapper")
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
_ = elements
|
|
posts := make([]PostInfo, 0, len(elements))
|
|
|
|
for _, el := range elements {
|
|
post := PostInfo{}
|
|
|
|
// get timestamp
|
|
timeEl, err := el.FindElement(selenium.ByTagName, "abbr")
|
|
if err != nil {
|
|
continue
|
|
}
|
|
timeStr, err := timeEl.GetAttribute("data-utime")
|
|
if err != nil {
|
|
continue
|
|
}
|
|
timestamp, err := strconv.Atoi(timeStr)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
post.Time = int64(timestamp)
|
|
|
|
// get link element
|
|
linkEl, err := timeEl.FindElement(selenium.ByXPATH, "..")
|
|
if err != nil {
|
|
continue
|
|
}
|
|
post.Link, err = linkEl.GetAttribute("href")
|
|
if err != nil {
|
|
continue
|
|
}
|
|
|
|
// get post content
|
|
postEl, err := el.FindElement(selenium.ByCSSSelector, "div.userContent")
|
|
if err != nil {
|
|
continue
|
|
}
|
|
post.Text, err = postEl.Text()
|
|
if err != nil {
|
|
continue
|
|
}
|
|
|
|
// get post id
|
|
postChile, err := postEl.FindElement(selenium.ByXPATH, "./*")
|
|
if err != nil {
|
|
continue
|
|
}
|
|
id, err := postChile.GetAttribute("id")
|
|
if err != nil || len(id) == 0 {
|
|
for _, regex := range idRegexps {
|
|
if regex.Match([]byte(post.Link)) {
|
|
strs := regex.FindAllStringSubmatch(post.Link, -1)
|
|
if len(strs) > 0 && len(strs[0]) > 1 {
|
|
id = strs[0][1]
|
|
break
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
posts = append(posts, post)
|
|
|
|
}
|
|
|
|
sort.Slice(posts, func(i, j int) bool {
|
|
return posts[i].Time > posts[j].Time
|
|
})
|
|
|
|
info = &posts[0]
|
|
|
|
// remove qs
|
|
urls := strings.Split(info.Link, "?")
|
|
if len(urls) > 1 {
|
|
qsStr := strings.Split(urls[1], "&")
|
|
if len(qsStr) > 0 {
|
|
qs := make([]string, 0)
|
|
for _, val := range qsStr {
|
|
items := strings.SplitN(val, "=", 1)
|
|
if len(items) > 0 {
|
|
if qsRegexp.MatchString(items[0]) {
|
|
qs = append(qs, val)
|
|
}
|
|
}
|
|
}
|
|
|
|
if len(qs) > 0 {
|
|
info.Link = urls[0] + "?" + strings.Join(qs, "&")
|
|
}
|
|
}
|
|
}
|
|
|
|
info.Link = fbURL + info.Link
|
|
|
|
return info, nil
|
|
}
|