From 8c775b22d1632f1d5a17c2aff898a9a269f95946 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=B8rgen=20Sverre=20Lien=20Sell=C3=A6g?=
Date: Thu, 17 Mar 2022 23:39:51 +0100
Subject: [PATCH] fix scraper

---
 .eslintrc.js                                  |   5 +-
 .prettierrc.js                                |   4 +
 package.json                                  |   3 +-
 src/darma.mjs                                 | 163 ++++++++++++++++
 ...cebook-request.js => facebook-request.mjs} |  26 +--
 5 files changed, 186 insertions(+), 15 deletions(-)
 create mode 100644 .prettierrc.js
 create mode 100644 src/darma.mjs
 rename src/{facebook-request.js => facebook-request.mjs} (63%)

diff --git a/.eslintrc.js b/.eslintrc.js
index 9cbe88b..fe0bb6f 100644
--- a/.eslintrc.js
+++ b/.eslintrc.js
@@ -3,11 +3,12 @@ module.exports = {
     browser: true,
     commonjs: true,
     es2020: true,
-    node: true,
+    node: true
   },
   extends: ['eslint:recommended', 'prettier'],
   parserOptions: {
     ecmaVersion: 11,
+    sourceType: 'module'
   },
-  rules: {},
+  rules: {}
 };
diff --git a/.prettierrc.js b/.prettierrc.js
new file mode 100644
index 0000000..f0722fd
--- /dev/null
+++ b/.prettierrc.js
@@ -0,0 +1,4 @@
+module.exports = {
+  singleQuote: true,
+  trailingComma: 'none'
+};
diff --git a/package.json b/package.json
index 0327063..45b6f7b 100644
--- a/package.json
+++ b/package.json
@@ -17,5 +17,6 @@
   "scripts": {
     "test": "jest",
     "watch": "jest --watch"
-  }
+  },
+  "sourceType": "module"
 }
diff --git a/src/darma.mjs b/src/darma.mjs
new file mode 100644
index 0000000..8d5e085
--- /dev/null
+++ b/src/darma.mjs
@@ -0,0 +1,163 @@
+import { hasPath, pathOr, props, last } from 'ramda';
+import { do_request } from './facebook-request.mjs';
+
+/** Resolves after `s` seconds; used to pause between paginated requests. */
+const sleep = (s) => new Promise((res) => setTimeout(res, s * 1000));
+
+/** Number of events requested per pagination call. */
+const PAGE_SIZE = 9;
+
+/**
+ * Runs one persisted GraphQL query and extracts `data.page[field]`.
+ *
+ * @param {string} query_name - Name used in the error log on a miss.
+ * @param {string} doc_id - Query id handed to `do_request`.
+ * @param {string} field - 'past_events' or 'upcoming_events'.
+ * @param {object} variables - Query variables.
+ * @returns {Promise<object|null>} The connection, or null if it is missing.
+ */
+const query_events = async (query_name, doc_id, field, variables) => {
+  const renderer_query_result = await do_request(doc_id, variables);
+  const page = pathOr(null, ['data', 'page', field], renderer_query_result);
+
+  if (page === null) {
+    console.error(`${query_name} returned null.`);
+  }
+
+  return page;
+};
+
+/** First page of past events for `pageID`, or null. */
+const doPageEventsTabPastEventsCardRendererQuery = ({ pageID }) =>
+  query_events(
+    'doPageEventsTabPastEventsCardRendererQuery',
+    '4421910857857782',
+    'past_events',
+    { pageID }
+  );
+
+/** First page of upcoming events for `pageID`, or null. */
+const doPageEventsTabUpcomingEventsCardRendererQuery = ({ pageID }) =>
+  query_events(
+    'doPageEventsTabUpcomingEventsCardRendererQuery',
+    '5182274978466320',
+    'upcoming_events',
+    { pageID }
+  );
+
+/** Page of past events following `cursor`, or null. */
+const doPageEventsTabPastEventsCardPaginationQuery = ({ pageID, cursor }) =>
+  query_events(
+    'doPageEventsTabPastEventsCardPaginationQuery',
+    '6953034388071359',
+    'past_events',
+    { pageID, cursor, count: PAGE_SIZE }
+  );
+
+/** Page of upcoming events following `cursor`, or null. */
+const doPageEventsTabUpcomingEventsCardPaginationQuery = ({ pageID, cursor }) =>
+  query_events(
+    'doPageEventsTabUpcomingEventsCardPaginationQuery',
+    '6985622308176123',
+    'upcoming_events',
+    { pageID, cursor, count: PAGE_SIZE }
+  );
+
+/**
+ * Collects the edges of `first_page` and of every page after it.
+ *
+ * Stops early, keeping what was gathered, if a page request returns null.
+ *
+ * @param {object} first_page - Connection with `edges` and `page_info`.
+ * @param {Function} fetch_next - Called with `{ cursor, pageID }`.
+ * @param {string} pageID - Facebook page id.
+ * @returns {Promise<object[]>} All collected edges.
+ */
+const collect_edges = async (first_page, fetch_next, pageID) => {
+  const edges = [...first_page.edges];
+  let { has_next_page, end_cursor: cursor } = first_page.page_info;
+
+  while (has_next_page) {
+    // Awaited on purpose: a bare sleep(2) would not delay the next request.
+    await sleep(2);
+    const pagination_result = await fetch_next({ cursor, pageID });
+    if (pagination_result === null) {
+      break;
+    }
+    edges.push(...pagination_result.edges);
+    ({ has_next_page, end_cursor: cursor } = pagination_result.page_info);
+  }
+
+  return edges;
+};
+
+/**
+ * Fetches the events of a Facebook page, following pagination.
+ *
+ * @param {object} options
+ * @param {string} options.pageID - Facebook page id.
+ * @param {boolean} [options.get_past_events] - Include past events.
+ * @param {boolean} [options.get_upcoming_events] - Include upcoming events.
+ * @returns {Promise<object[]>} Event nodes, upcoming events before past ones.
+ */
+const get_page_events = async ({
+  pageID,
+  get_past_events,
+  get_upcoming_events
+}) => {
+  let past_events = [];
+
+  if (get_past_events) {
+    const result = await doPageEventsTabPastEventsCardRendererQuery({ pageID });
+    if (result !== null) {
+      past_events = await collect_edges(
+        result,
+        doPageEventsTabPastEventsCardPaginationQuery,
+        pageID
+      );
+    }
+  }
+
+  let upcoming_events = [];
+
+  if (get_upcoming_events) {
+    const result = await doPageEventsTabUpcomingEventsCardRendererQuery({
+      pageID
+    });
+    if (result !== null) {
+      upcoming_events = await collect_edges(
+        result,
+        doPageEventsTabUpcomingEventsCardPaginationQuery,
+        pageID
+      );
+    }
+  }
+
+  return [...upcoming_events, ...past_events].map(({ node }) => node);
+};
+
+// Example of the connection shape the code above reads:
+/// const events = {
+///   edges: [
+///     {
+///       node: {
+///         __typename: "Event",
+///       },
+///       cursor:
+///         "AQHRC7ZNKEqDS75jWJfLUWromnLVgAOGzVAZE7c7CcKfoEaLCcXFSvhMvoxN8yk_Yq6fFMTWjuHjitD5sE1IzW68sw",
+///     },
+///   ],
+///   page_info: {
+///     has_next_page: true,
+///     end_cursor:
+///       "AQHRAh7tKZowf3mdyxtYISP1LNVo45rFI8HJ4nT5SuVgl0NBUfZFslx5qy1eba3YXhdjJ-S2vfojcTGF4ygnt_DQiQ",
+///   },
+/// };
+
+(async () => {
+  const res = await get_page_events({
+    pageID: '149127815110411',
+    get_upcoming_events: true
+  });
+  console.log(res);
+})();
diff --git a/src/facebook-request.js b/src/facebook-request.mjs
similarity index 63%
rename from src/facebook-request.js
rename to src/facebook-request.mjs
index 2cb51a2..5218da9 100644
--- a/src/facebook-request.js
+++ b/src/facebook-request.mjs
@@ -1,21 +1,27 @@
-const fetch = require('node-fetch');
+import fetch from 'node-fetch';
 const graphql_endpoint = 'https://www.facebook.com/api/graphql/';
+import * as url from 'url';
 
-const https_proxy_agent = require('https-proxy-agent');
+import https_proxy_agent from 'https-proxy-agent';
 
-const do_request = async (doc_id, variables, parse = true) => {
+export const do_request = async (doc_id, variables, parse = true) => {
   const params = new URLSearchParams();
-  const agent = new https_proxy_agent('http://10.0.0.210:5566');
+  // Proxy credentials are read from the environment (FB_PROXY_AUTH, as
+  // "user:password") so that no secret is committed with the source.
+  const proxyOpts = url.parse('http://geo.iproyal.com:12323');
+  proxyOpts.auth = process.env.FB_PROXY_AUTH || null;
+  const agent = new https_proxy_agent(proxyOpts);
   params.append('doc_id', doc_id);
   params.append('variables', JSON.stringify(variables));
+  console.log(variables);
 
   const fetch_options = {
     headers: {
-      'Content-Type': 'application/x-www-form-urlencoded',
+      'Content-Type': 'application/x-www-form-urlencoded'
     },
     body: params,
     method: 'POST',
-    agent,
+    agent
   };
 
   let res = null;
@@ -26,12 +32,12 @@ const do_request = async (doc_id, variables, parse = true) => {
     return null;
   }
 
+  const txt = await res.text();
+
   if (!res.ok) {
     return null;
   }
 
-  const txt = await res.text();
-
   if (parse) {
     try {
       res = JSON.parse(txt);
@@ -48,7 +54,3 @@ const do_request = async (doc_id, variables, parse = true) => {
   }
   return res;
 };
-
-module.exports = {
-  do_request,
-};