This commit is contained in:
Jay 2019-05-22 23:28:20 +08:00
commit 2c1c61fb48
9 changed files with 2074 additions and 0 deletions

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
.env
node_modules/

4
config/index.js Normal file
View File

@ -0,0 +1,4 @@
module.exports = {
api_url: process.env.API_URL || '',
api_key: process.env.API_KEY || ''
}

44
index.js Normal file
View File

@ -0,0 +1,44 @@
require('dotenv').config()
const cron = require('cron')
const apis = require('./libs/apis.js')
const crawler = require('./libs/crawler.js')
/**
 * Fetch the list of Instagram ids from the backend API and crawl the
 * latest post of each one.
 *
 * @returns {Promise<void>} resolves when every id has been processed
 */
const runIGLook = async () => {
  const ids = await apis.GetInstagramIDs()
  console.log('all ids :: ', ids)
  if (ids === null || !Array.isArray(ids) || ids.length === 0) return
  // Await every crawl: `forEach` ignores the promise returned by an async
  // callback, so failures in getPost() became unhandled rejections and
  // never reached the caller's try/catch. Promise.all keeps the crawls
  // concurrent while still propagating errors.
  await Promise.all(ids.map(id => getPost(id)))
}
/**
 * Crawl the most recent post of one Instagram account and forward it to
 * the backend when it was published within the last 30 minutes.
 *
 * @param {string} id Instagram account id (profile page slug)
 * @returns {Promise<void>}
 */
const getPost = async (id) => {
  console.log('get instagram :: ', id)
  const post = await crawler.getLastPost(id)
  // getLastPost returns null on request or parse failure; the old code
  // dereferenced post.id unconditionally and threw a TypeError here.
  if (post === null || typeof post !== 'object') {
    console.log('no post data for :: ', id)
    return
  }
  console.log(`get post data ::: ${post.id} / ${post.text} / ${post.link} / ${post.timestamp}`)
  // Only forward posts newer than 30 minutes (timestamps are unix seconds).
  const minTime = Math.floor(Date.now() / 1000) - 1800
  if (minTime > post.timestamp) return
  const data = {
    id,
    post_id: post.id,
    text: post.text,
    link: post.link
  }
  await apis.SendPostData([data])
}
// Schedule the Instagram crawl: runs at second 00 of every 2nd minute,
// and once immediately at startup (runOnInit).
new cron.CronJob({ //eslint-disable-line
  cronTime: '00 */2 * * * *', // six-field cron: sec min hour dom mon dow
  onTick: async () => {
    console.log('Start Tick')
    try {
      await runIGLook()
    } catch (err) {
      // Never let a failed tick crash the process; log and wait for the next run.
      console.log('run tick fail', err)
    }
  },
  runOnInit: true, // fire one tick right away instead of waiting for the schedule
  start: true, // start the timer immediately on construction
  timeZone: 'Asia/Taipei'
})

36
init.d/iglook Normal file
View File

@ -0,0 +1,36 @@
#!/sbin/openrc-run
# OpenRC service script for the iglook Instagram crawler.

DIRECTORY=/opt/iglook
PIDFILE=/var/run/iglook.pid

depend() {
	need net
}

start() {
	ebegin "start iglook"
	start-stop-daemon \
		--start \
		-d "${DIRECTORY}" \
		-1 /var/log/iglook.log \
		-2 /var/log/iglook.err \
		-m --pidfile ${PIDFILE} \
		--background \
		--exec /usr/bin/node -- index.js
	eend $?
}

stop() {
	ebegin "stop iglook"
	# Fixed: the previous version had a trailing backslash on the last
	# option line, which made `eend $?` part of the start-stop-daemon
	# command line instead of a separate statement.
	start-stop-daemon \
		--stop \
		--pidfile ${PIDFILE} \
		-d "${DIRECTORY}"
	eend $?
}

reload() {
	ebegin "restart iglook"
	# Fixed: kill expects a PID, not the pidfile path — read the PID
	# out of the file before signalling.
	kill -HUP "$(cat ${PIDFILE})"
	start
	eend $?
}

43
libs/apis.js Normal file
View File

@ -0,0 +1,43 @@
const axios = require('axios')
const config = require('../config/index.js')
const baseURL = config.api_url
const apis = {}
module.exports = apis
/**
 * Fetch the list of Instagram account ids to crawl from the backend API.
 *
 * @returns {Promise<Array|null>} the id list, or null on any failure
 *   (network error or unexpected response shape)
 */
apis.GetInstagramIDs = async () => {
  try {
    const result = await axios({
      url: '/api/private/ig',
      baseURL,
      method: 'get',
      headers: {
        'X-Mtfos-Key': config.api_key
      }
    })
    // Defensive shape check: require { data: { list: [...] } }.
    if (!('data' in result) || typeof result.data !== 'object' || !('list' in result.data) || !Array.isArray(result.data.list)) return null
    return result.data.list
  } catch (err) {
    // Keep the null-on-failure contract, but stop swallowing the error
    // silently — match the logging style of libs/crawler.js.
    console.log('GetInstagramIDs failed ::', err.message || err)
    return null
  }
}
/**
 * Push crawled post data to the backend API.
 *
 * @param {Array<{id: string, post_id: string, text: string, link: string}>} posts
 *   post records; entries missing any required field are dropped
 * @returns {Promise<void|null>} null on request failure (kept for
 *   backward compatibility), otherwise undefined
 */
apis.SendPostData = async (posts = []) => {
  if (!Array.isArray(posts) || posts.length === 0) return
  // Filter into a new array instead of reassigning the parameter.
  const valid = posts.filter(t => {
    return 'id' in t && 'post_id' in t && 'link' in t && 'text' in t
  })
  if (valid.length === 0) return
  try {
    await axios({
      url: '/api/private/igposts',
      baseURL,
      method: 'post',
      data: {
        igs: valid
      },
      headers: {
        'X-Mtfos-Key': config.api_key
      }
    })
  } catch (err) {
    // Don't swallow the failure silently; log it like crawler.js does.
    console.log('SendPostData failed ::', err.message || err)
    return null
  }
}

82
libs/crawler.js Normal file
View File

@ -0,0 +1,82 @@
const axios = require('axios')
const cheerio = require('cheerio')
const SafeEval = require('safe-eval')
const baseURL = 'https://www.instagram.com'
const obj = {}
module.exports = obj
/**
 * Build the public URL of an Instagram post.
 *
 * @param {string} shortcode ig post shortcode
 * @returns {string} full post URL, or '' for a blank/non-string shortcode
 */
const getPostLink = (shortcode = '') => {
  const usable = typeof shortcode === 'string' && shortcode.trim().length > 0
  if (!usable) return ''
  // Strip a trailing slash from the base so we never emit "//p/".
  const root = baseURL.replace(/\/$/, '')
  return `${root}/p/${shortcode}`
}
/**
 * Parse an Instagram profile page and return a summary of its most
 * recent post.
 *
 * Instagram embeds the profile data in an inline script tag that
 * assigns `window._sharedData = {...}`; this function finds that
 * script, evaluates it in an isolated context via safe-eval, and walks
 * the resulting object down to the first timeline edge.
 *
 * NOTE(review): safe-eval executes attacker-influenced page content and
 * the package has known sandbox-escape issues — consider parsing the
 * assignment's right-hand side with JSON.parse instead.
 *
 * @param {string} data instagram profile page HTML
 * @returns {{id: string, timestamp: number, link: string, text: string}|null}
 *   latest-post summary, or null when input is empty or no _sharedData
 *   script is found
 */
const getInstagramData = (data) => {
  // console.log(data)
  if (typeof data !== 'string' || data.trim().length === 0) return null
  let $ = cheerio.load(data)
  let dataArr = []
  // Collect the source of every inline script that assigns window._sharedData.
  $('script').each((idx, el) => {
    let e = cheerio.load(el)
    // console.log(e('script').html())
    if (/^window._sharedData/i.test(e('script').html())) {
      // console.log('aaa')
      dataArr.push(e('script').html())
    }
  })
  // console.log(dataArr)
  if (dataArr.length === 0) return null
  // Evaluate the assignment in a sandbox so it populates context.window.
  let context = { window: {} }
  SafeEval(dataArr[0], context)
  // The first edge is the newest post — presumably edges are ordered
  // newest-first by Instagram; TODO confirm.
  let posts = context.window._sharedData.entry_data.ProfilePage[0].graphql.user.edge_owner_to_timeline_media.edges
  let lastPost = posts[0].node
  // console.log(JSON.stringify(lastPost, null, 2))
  let postData = {}
  postData.id = lastPost.id
  postData.timestamp = lastPost.taken_at_timestamp
  postData.link = getPostLink(lastPost.shortcode)
  try {
    // A post may have no caption at all — fall back to an empty string.
    postData.text = lastPost.edge_media_to_caption.edges.length > 0 ? lastPost.edge_media_to_caption.edges[0].node.text : ''
  } catch (err) {
    postData.text = ''
  }
  // Drop references to the large parsed structure before returning.
  lastPost = null
  posts = null
  delete context.window
  return postData
}
/**
 * Download an Instagram profile page and extract its latest post.
 *
 * @param {string} id ig user page id (profile slug)
 * @returns {Promise<object|null>} post summary from getInstagramData,
 *   or null on invalid input, request failure, or unexpected response
 */
obj.getLastPost = async (id) => {
  if (typeof id !== 'string' || id.trim().length === 0) return null
  const param = {
    baseURL,
    method: 'get',
    url: `/${id}`,
    headers: {
      // Desktop browser UA — presumably Instagram serves different markup
      // to unknown clients; verify if the parse starts failing.
      'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:68.0) Gecko/20100101 Firefox/68.0'
      // 'Referer': 'https://wiki.trj.tw'
    }
  }
  try {
    const result = await axios(param)
    if (!('data' in result) || typeof result.data !== 'string') return null
    return getInstagramData(result.data)
  } catch (err) {
    // err.response is undefined for network-level failures (DNS, timeout),
    // so guard before dereferencing — the old catch threw its own TypeError.
    const detail = err.response ? err.response.data : null
    console.log(detail || 'no response error data')
    return null
  }
}

1834
package-lock.json generated Normal file

File diff suppressed because it is too large Load Diff

23
package.json Normal file
View File

@ -0,0 +1,23 @@
{
"name": "node-ig-crawler",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"start": "node index.js",
"test": "echo \"Error: no test specified\" && exit 1"
},
"keywords": [],
"author": "",
"license": "ISC",
"dependencies": {
"axios": "^0.18.0",
"cheerio": "^1.0.0-rc.3",
"cron": "^1.7.1",
"dotenv": "^8.0.0",
"safe-eval": "^0.4.1"
},
"devDependencies": {
"standard": "^12.0.1"
}
}

6
test.js Normal file
View File

@ -0,0 +1,6 @@
// Ad-hoc manual test: crawl one profile and dump the parsed latest post.
const igCrawler = require('./libs/crawler.js')

;(async () => {
  const result = await igCrawler.getLastPost('otakukaze')
  console.log(result)
})().catch(err => {
  // The old `.then(() => {})` left rejections unhandled; surface them.
  console.log('test run failed ::', err)
})