node-iglook/libs/crawler.js
2019-05-22 23:28:20 +08:00

83 lines
2.3 KiB
JavaScript

const axios = require('axios')
const cheerio = require('cheerio')
const SafeEval = require('safe-eval')
// Instagram origin: used as the axios baseURL for the profile request and as
// the prefix when building post permalinks.
const baseURL = 'https://www.instagram.com'
// Export container: the module's public functions are attached to this object
// below, and the object itself is what `require()` of this file returns.
const obj = {}
module.exports = obj
/**
 * Build the permalink of a post from its shortcode.
 *
 * @param {string} shortcode ig post shortcode; surrounding whitespace is ignored
 * @returns {string} `<baseURL>/p/<shortcode>`, or '' when shortcode is not a
 *   non-blank string
 */
const getPostLink = (shortcode = '') => {
  if (typeof shortcode !== 'string') return ''
  // Validate and interpolate the same trimmed value, so padded input that
  // passes the blank check cannot leak whitespace into the URL.
  const code = shortcode.trim()
  if (code.length === 0) return ''
  // Drop a trailing slash from the base so the result never contains "//p/".
  return `${baseURL.replace(/\/$/, '')}/p/${code}`
}
/**
 * Extract the most recent post from the HTML of an Instagram profile page.
 *
 * Finds the first inline <script> whose text starts with `window._sharedData`,
 * evaluates it against a throw-away `window` object and reads the first entry
 * of `entry_data.ProfilePage[0].graphql.user.edge_owner_to_timeline_media.edges`.
 *
 * @param {string} data instagram source html
 * @returns {{id: *, timestamp: *, link: string, text: string}|null} the latest
 *   post, or null when `data` is not a non-blank string, no shared-data script
 *   is present, or the expected profile/timeline structure is missing or empty
 * @throws whatever SafeEval throws when the script text cannot be evaluated
 */
const getInstagramData = (data) => {
  if (typeof data !== 'string' || data.trim().length === 0) return null

  // Walk a chain of keys, yielding undefined as soon as a link is missing
  // instead of throwing on pages that lack the expected structure.
  const dig = (root, path) => path.reduce(
    (node, key) => (node == null ? undefined : node[key]),
    root
  )

  const $ = cheerio.load(data)
  let sharedDataScript = null
  $('script').each((idx, el) => {
    const scriptText = $(el).html()
    if (/^window\._sharedData/i.test(scriptText)) {
      sharedDataScript = scriptText
      return false // only the first match is used; false stops cheerio's each()
    }
  })
  if (sharedDataScript === null) return null

  // NOTE(security): this evaluates script text taken from a remote page.
  // safe-eval is not a hardened sandbox; the payload is expected to be a plain
  // `window._sharedData = {...}` assignment, so extracting the object literal
  // and using JSON.parse would avoid code execution entirely. Left as-is
  // pending confirmation that the payload is always strict JSON.
  const context = { window: {} }
  SafeEval(sharedDataScript, context)

  const edges = dig(context.window, [
    '_sharedData', 'entry_data', 'ProfilePage', 0, 'graphql', 'user',
    'edge_owner_to_timeline_media', 'edges'
  ])
  const lastPost = dig(edges, [0, 'node'])
  // Login walls, changed page structure and profiles without posts all end here.
  if (!lastPost) return null

  // A post without a caption has no caption edges; report that as ''.
  const captionText = dig(lastPost, ['edge_media_to_caption', 'edges', 0, 'node', 'text'])
  return {
    id: lastPost.id,
    timestamp: lastPost.taken_at_timestamp,
    link: getPostLink(lastPost.shortcode),
    text: captionText == null ? '' : captionText
  }
}
/**
 * Fetch an Instagram profile page and return its most recent post.
 *
 * @param {string} id ig user page id; surrounding whitespace is ignored
 * @returns {Promise<Object|null>} the value produced by getInstagramData() for
 *   the fetched page, or null when `id` is not a non-blank string, the
 *   response body is not a string, or the request/parsing fails
 */
obj.getLastPost = async (id) => {
  if (typeof id !== 'string' || id.trim().length === 0) return null
  const param = {
    baseURL,
    method: 'get',
    // Use the same trimmed value that passed validation.
    url: `/${id.trim()}`,
    headers: {
      'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:68.0) Gecko/20100101 Firefox/68.0'
    }
  }
  try {
    const result = await axios(param)
    if (!result || typeof result.data !== 'string') return null
    return getInstagramData(result.data)
  } catch (err) {
    // Only errors for answered requests carry `err.response`; network failures
    // and exceptions thrown while parsing the page do not. Dereferencing it
    // unguarded would throw from inside this handler and reject the promise
    // instead of resolving to null.
    const detail = err && err.response && err.response.data
    console.log(detail || (err && err.message) || 'no response error data')
    return null
  }
}