diff --git a/src/logic.js b/src/logic.js
index 24b36ac..4ff8282 100644
--- a/src/logic.js
+++ b/src/logic.js
@@ -1,14 +1,25 @@
-const { pathOr, hasPath, props, prop, unionWith, eqBy } = require('ramda');
+const {
+  eqBy,
+  hasPath,
+  maxBy,
+  pathOr,
+  prop,
+  props,
+  unionWith,
+} = require('ramda');
 
 const parseArgs = require('minimist');
 const process = require('process');
 
 const event_url = (event_id) => `https://www.facebook.com/events/${event_id}`;
 const page_url = (page_id) => `https://www.facebook.com/${page_id}`;
 const page_events_url = (page_id) => page_url(page_id) + '/events/';
+const { graphql_endpoint } = require('./constants');
 
 const fs = require('fs').promises;
 const filesystem = require('fs');
+const path = require('path');
+
 const gm = require('gm').subClass({ imageMagick: true });
 const puppeteer = require('puppeteer');
 
@@ -51,12 +62,16 @@ const parse_args = (args) => {
   };
 };
 
-const get_upcoming_events = pathOr(
+const get_upcoming_events_from_page = pathOr(
   null,
   'data.page.upcoming_events'.split('.'),
 );
 
-const get_past_events = pathOr(null, 'data.page.past_events'.split('.'));
+const get_past_events_from_page = pathOr(
+  null,
+  'data.page.past_events'.split('.'),
+);
+
 const merge_edges = unionWith(eqBy(prop('event_id')));
 
 const write_image = (path, image) =>
@@ -213,14 +228,14 @@ const register_upcoming_events_listener = (endpoint, page) => {
       if (endpoint === response.request().url()) {
         try {
           const json = await response.json();
-          const upcoming_events = get_upcoming_events(json);
+          const upcoming_events = get_upcoming_events_from_page(json);
           if (upcoming_events !== null) {
             responses = [upcoming_events, ...responses];
             if (!upcoming_events.page_info.has_next_page) {
               resolve(responses);
             }
           }
-          const past_events = get_past_events(json);
+          const past_events = get_past_events_from_page(json);
           if (past_events !== null) {
             if (!past_events.page_info.has_next_page) {
               resolve(responses);
@@ -234,9 +249,41 @@ const register_upcoming_events_listener = (endpoint, page) => {
   });
 };
 
+const get_upcoming_events = async (browser, page_id) => {
+  let scraping = true;
+  const facebook_page = await browser.newPage();
+  const upcoming_events = register_upcoming_events_listener(
+    graphql_endpoint,
+    facebook_page,
+  )
+    .then((upcoming_events) => {
+      scraping = false;
+      return upcoming_events;
+    })
+    .catch((err) => {
+      console.error(err);
+      scraping = false;
+    });
+
+  await facebook_page.goto(page_id);
+
+  while (scraping) {
+    await facebook_page.evaluate(() => window.scrollBy(0, window.innerHeight));
+  }
+
+  const responses = await upcoming_events;
+  const nodes = responses.reduce(
+    (res, current) => [...res, ...current.edges],
+    [],
+  );
+
+  return nodes.map(map_event);
+};
+
 module.exports = {
   create_images_directory,
   open_browser,
   parse_args,
   read_previous_events,
+  get_upcoming_events,
 };
diff --git a/src/scrape.js b/src/scrape.js
index c051a57..1a65d33 100644
--- a/src/scrape.js
+++ b/src/scrape.js
@@ -1,10 +1,8 @@
-const { pathOr, maxBy } = require('ramda');
-const path = require('path');
-const fs = require('fs').promises;
+const { pathOr } = require('ramda');
 
-const { graphql_endpoint } = require('./constants');
 const {
   create_images_directory,
+  get_upcoming_events,
   open_browser,
   parse_args,
   read_previous_events,
@@ -24,35 +22,7 @@ const { page_ids, output, events: event_file } = parse_args(
 
   let events = [];
   for (let page_id of page_ids) {
-    let scraping = true;
-    const facebook_page = await browser.newPage();
-    const upcoming_events = register_upcoming_events_listener(
-      graphql_endpoint,
-      facebook_page,
-    )
-      .then((upcoming_events) => {
-        scraping = false;
-        return upcoming_events;
-      })
-      .catch((err) => {
-        console.error(err);
-        scraping = false;
-      });
-
-    await facebook_page.goto(page_id);
-
-    while (scraping) {
-      await facebook_page.evaluate(() =>
-        window.scrollBy(0, window.innerHeight),
-      );
-    }
-
-    const responses = await upcoming_events;
-    const nodes = responses.reduce(
-      (res, current) => [...res, ...current.edges],
-      [],
-    );
-    const new_events = nodes.map(map_event);
+    const new_events = await get_upcoming_events(browser, page_id);
     events = merge_edges(new_events, events);
 
     events = events.filter(