From 098a51671b8ee4270f59634d4ce05cbbb04a73b5 Mon Sep 17 00:00:00 2001
From: jay
Date: Sun, 10 Mar 2019 20:04:20 +0800
Subject: [PATCH] fin facebook crawler

---
 Dockerfile                   |   2 +-
 main.go                      |  21 +----
 modules/facebook/facebook.go | 200 +++++++++++++++++++++++++++++++++++
 3 files changed, 203 insertions(+), 20 deletions(-)
 create mode 100644 modules/facebook/facebook.go

diff --git a/Dockerfile b/Dockerfile
index 58ece17..314ecc7 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -4,6 +4,6 @@ COPY . .
 RUN apk add --no-cache git make && \
     go build -o crawler .
 
-FROM geckodriver:latest
+FROM docker.mtfos.xyz/mtfos/geckodriver-alpine:latest
 COPY --from=builder /data/crawler /usr/bin/crawler
 CMD ["/usr/bin/crawler"]
\ No newline at end of file
diff --git a/main.go b/main.go
index 53ca31c..7387e3f 100644
--- a/main.go
+++ b/main.go
@@ -4,7 +4,7 @@ import (
 	"fmt"
 	"log"
 
-	"github.com/tebeka/selenium"
+	"git.trj.tw/golang/go-crawler/modules/facebook"
 	"git.trj.tw/golang/go-crawler/modules/browser"
 )
 
@@ -21,22 +21,5 @@ func main() {
 		log.Fatal(err)
 	}
 	defer func() { browser.StopService() }()
-
-	wd, err := browser.NewWD()
-	if err != nil {
-		log.Fatal(err)
-	}
-	defer wd.Quit()
-
-	err = wd.Get("https://tools.trj.tw")
-	if err != nil {
-		log.Fatal(err)
-	}
-
-	el, err := wd.FindElement(selenium.ByCSSSelector, "body")
-	if err != nil {
-		log.Fatal(err)
-	}
-
-	fmt.Println(el.Text())
+	fmt.Println(facebook.GetLastPost("MTFoS-404079100096306"))
 }
diff --git a/modules/facebook/facebook.go b/modules/facebook/facebook.go
new file mode 100644
index 0000000..2d6b310
--- /dev/null
+++ b/modules/facebook/facebook.go
@@ -0,0 +1,200 @@
+// Package facebook scrapes the post list of a public Facebook page through
+// a Selenium WebDriver session.
+package facebook
+
+import (
+	"errors"
+	"fmt"
+	"net/url"
+	"regexp"
+	"sort"
+	"strconv"
+	"strings"
+
+	"git.trj.tw/golang/go-crawler/modules/browser"
+	"github.com/tebeka/selenium"
+)
+
+// fbURL is the site root that page paths and relative post links hang off.
+var fbURL = "https://www.facebook.com"
+
+// PostInfo holds the fields extracted from one post of a page.
+type PostInfo struct {
+	Text string // text of the post's content element
+	ID   string // id attribute of the content's first child, else parsed from Link
+	Link string // href of the anchor wrapping the post's timestamp
+	Time int64  // value of the timestamp's data-utime attribute
+}
+
+// ByPostInfo implements sort.Interface, ordering posts by ascending Time.
+type ByPostInfo []PostInfo
+
+// Len returns the number of posts.
+func (p ByPostInfo) Len() int { return len(p) }
+
+// Swap exchanges the posts at indexes i and j.
+func (p ByPostInfo) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
+
+// Less reports whether post i has a smaller Time than post j.
+func (p ByPostInfo) Less(i, j int) bool { return p[i].Time < p[j].Time }
+
+var (
+	// idRegexps extract the numeric post ID from the known permalink shapes.
+	idRegexps = []*regexp.Regexp{
+		regexp.MustCompile(`[?&]story_fbid=(\d+)`),
+		regexp.MustCompile(`/posts/(\d+)`),
+		regexp.MustCompile(`/photos/.+?/(\d+)`),
+		regexp.MustCompile(`/videos/(\d+)`),
+	}
+
+	// qsRegexp selects the query-string keys kept in a post link. It is
+	// unanchored, so it keeps every key containing "id" (e.g. "story_fbid").
+	qsRegexp = regexp.MustCompile("id")
+)
+
+// GetLastPost loads the posts tab of page in a new WebDriver session and
+// returns the post with the largest timestamp, with its link reduced to the
+// query parameters matched by qsRegexp and made absolute. Posts missing a
+// timestamp, link or content element are skipped. An error is returned when
+// page is empty, a WebDriver call fails, or no post could be parsed.
+func GetLastPost(page string) (info *PostInfo, err error) {
+	if len(page) == 0 {
+		return nil, errors.New("page id is empty")
+	}
+
+	pageURL, err := url.Parse(fbURL)
+	if err != nil {
+		return nil, err
+	}
+	pageURL, err = pageURL.Parse(fmt.Sprintf("/%s/posts", page))
+	if err != nil {
+		return nil, err
+	}
+
+	wd, err := browser.NewWD()
+	if err != nil {
+		return nil, err
+	}
+	defer wd.Quit()
+
+	if err = wd.Get(pageURL.String()); err != nil {
+		return nil, err
+	}
+
+	elements, err := wd.FindElements(selenium.ByCSSSelector, "div.userContentWrapper")
+	if err != nil {
+		return nil, err
+	}
+
+	posts := make([]PostInfo, 0, len(elements))
+	for _, el := range elements {
+		if post, ok := parsePost(el); ok {
+			posts = append(posts, post)
+		}
+	}
+	// Indexing posts[0] on an empty slice would panic, so fail explicitly.
+	if len(posts) == 0 {
+		return nil, errors.New("no post found")
+	}
+
+	// Newest post first.
+	sort.Sort(sort.Reverse(ByPostInfo(posts)))
+	info = &posts[0]
+
+	info.Link = cleanLink(info.Link)
+	// Only prefix the site root when the href is relative; an href that is
+	// already absolute would otherwise get the host glued on twice.
+	if !strings.HasPrefix(info.Link, "http://") && !strings.HasPrefix(info.Link, "https://") {
+		info.Link = fbURL + info.Link
+	}
+
+	return info, nil
+}
+
+// parsePost extracts one PostInfo from a post wrapper element. ok is false
+// when the timestamp, link, content or content child cannot be read.
+func parsePost(el selenium.WebElement) (PostInfo, bool) {
+	var post PostInfo
+
+	// The timestamp is the data-utime attribute of the post's abbr element.
+	timeEl, err := el.FindElement(selenium.ByTagName, "abbr")
+	if err != nil {
+		return PostInfo{}, false
+	}
+	timeStr, err := timeEl.GetAttribute("data-utime")
+	if err != nil {
+		return PostInfo{}, false
+	}
+	// ParseInt yields an int64 directly, independent of the platform's
+	// int size.
+	post.Time, err = strconv.ParseInt(timeStr, 10, 64)
+	if err != nil {
+		return PostInfo{}, false
+	}
+
+	// The parent of the abbr element carries the link to the post.
+	linkEl, err := timeEl.FindElement(selenium.ByXPATH, "..")
+	if err != nil {
+		return PostInfo{}, false
+	}
+	post.Link, err = linkEl.GetAttribute("href")
+	if err != nil {
+		return PostInfo{}, false
+	}
+
+	contentEl, err := el.FindElement(selenium.ByCSSSelector, "div.userContent")
+	if err != nil {
+		return PostInfo{}, false
+	}
+	post.Text, err = contentEl.Text()
+	if err != nil {
+		return PostInfo{}, false
+	}
+
+	// Prefer the id attribute of the content's first child and fall back to
+	// the ID embedded in the link.
+	childEl, err := contentEl.FindElement(selenium.ByXPATH, "./*")
+	if err != nil {
+		return PostInfo{}, false
+	}
+	post.ID, err = childEl.GetAttribute("id")
+	if err != nil || len(post.ID) == 0 {
+		post.ID = idFromLink(post.Link)
+	}
+
+	return post, true
+}
+
+// idFromLink returns the numeric ID captured by the first of idRegexps that
+// matches link, or "" when none matches.
+func idFromLink(link string) string {
+	for _, re := range idRegexps {
+		if m := re.FindStringSubmatch(link); m != nil {
+			return m[1]
+		}
+	}
+	return ""
+}
+
+// cleanLink strips from link every query parameter whose key does not match
+// qsRegexp; when no parameter is kept the query string is dropped entirely.
+func cleanLink(link string) string {
+	parts := strings.SplitN(link, "?", 2)
+	if len(parts) < 2 {
+		return link
+	}
+
+	var kept []string
+	for _, param := range strings.Split(parts[1], "&") {
+		// n must be 2: with n == 1 SplitN returns param unsplit, so the
+		// filter would test the whole "key=value" pair instead of the key.
+		key := strings.SplitN(param, "=", 2)[0]
+		if qsRegexp.MatchString(key) {
+			kept = append(kept, param)
+		}
+	}
+	if len(kept) == 0 {
+		return parts[0]
+	}
+	return parts[0] + "?" + strings.Join(kept, "&")
+}