diff --git a/src/browse.js b/src/browse.js new file mode 100644 index 0000000..ca35e88 --- /dev/null +++ b/src/browse.js @@ -0,0 +1,142 @@ +const puppeteer = require('puppeteer'); + +const { + has_past_events, + get_body_inner_text, + has_upcoming_events, + get_upcoming_events_from_page, + get_past_events_from_page, + map_event, +} = require('./logic'); +const { graphql_endpoint } = require('./constants'); + +const open_browser = async ({ headless }) => { + const browser = await puppeteer.launch({ + headless, + args: ['--disable-dev-shm-usage'], + }); + return browser; +}; + +const register_page_scraper = (endpoint, page, past_events = false) => { + let responses = []; + return new Promise((resolve, reject) => { + page.on('response', async (response) => { + if (endpoint === response.request().url()) { + let json = {}; + try { + json = await response.json(); + } catch (error) { + return responses; + } + + const getters = { + upcoming: get_upcoming_events_from_page, + past: get_past_events_from_page, + }; + + const events = getters[past_events ? 
'past' : 'upcoming'](json); + if (events !== null) { + responses = [events, ...responses]; + if (!events.page_info.has_next_page) { + resolve(responses); + } + } + } + }); + }); +}; + +const get_page_events = async ( + browser, + page_id, + get_upcoming_events = true, + get_past_events = false, +) => { + const facebook_page = await browser.newPage(); + + let past_events = []; + let upcoming_events = []; + + let scraping_past_events = false; + let scraping_upcoming_events = false; + + if (get_past_events) { + scraping_past_events = true; + past_events = register_page_scraper(graphql_endpoint, facebook_page, true) + .then((past_events) => { + scraping_past_events = false; + return past_events; + }) + .catch((err) => { + console.error(err); + scraping_past_events = false; + return []; + }); + } else { + past_events = Promise.resolve([]); + } + if (get_upcoming_events) { + scraping_upcoming_events = true; + upcoming_events = register_page_scraper(graphql_endpoint, facebook_page) + .then((upcoming_events) => { + scraping_upcoming_events = false; + return upcoming_events; + }) + .catch((err) => { + console.error(err); + scraping_upcoming_events = false; + return []; + }); + } else { + upcoming_events = Promise.resolve([]); + } + + await facebook_page.goto(page_id); + await facebook_page.waitFor(2000); + const accept_buttons = await facebook_page.$x( + "//button[contains(text(), 'Accept All')]", + ); + if (accept_buttons.length > 0) { + accept_buttons[0].click(); + } + + const body_text = (await get_body_inner_text(facebook_page)).toLowerCase(); + const past_resolved = get_past_events && !has_past_events(body_text); + const upcoming_resolved = + get_upcoming_events && !has_upcoming_events(body_text); + + if (past_resolved) { + past_events = Promise.resolve([]); + scraping_past_events = false; + } + + if (upcoming_resolved) { + upcoming_events = Promise.resolve([]); + scraping_upcoming_events = false; + } + + while (scraping_past_events || scraping_upcoming_events) { + 
await facebook_page.waitFor(1000); + await facebook_page.evaluate(() => window.scrollBy(0, window.innerHeight)); + if (past_resolved && upcoming_resolved) { + break; + } + } + + upcoming_events = await upcoming_events; + past_events = await past_events; + + const responses = [...upcoming_events, ...past_events]; + const nodes = responses.reduce( + (res, current) => [...res, ...current.edges], + [], + ); + + return nodes.map(map_event); +}; + +module.exports = { + get_page_events, + open_browser, +}; diff --git a/src/logic.js b/src/logic.js index f744810..6cd4793 100644 --- a/src/logic.js +++ b/src/logic.js @@ -4,13 +4,10 @@ const process = require('process'); const page_url = (page_id) => `https://www.facebook.com/${page_id}`; const page_events_url = (page_id) => page_url(page_id) + '/events/'; -const { graphql_endpoint } = require('./constants'); const fs = require('fs').promises; const filesystem = require('fs'); -const puppeteer = require('puppeteer'); - const flatten_string = (page_id) => { if (page_id.startsWith('"') && page_id.endsWith('"')) { return page_id.slice(1, page_id.length - 1); @@ -132,14 +129,6 @@ const map_event = ({ node: event }) => { }; }; -const open_browser = async ({ headless }) => { - const browser = await puppeteer.launch({ - headless, - args: ['--disable-dev-shm-usage'], - }); - return browser; -}; - const get_body_inner_text = async (page) => await page.evaluate('document.querySelector("body").innerText;'); @@ -150,128 +139,14 @@ const has_upcoming_events = (body) => const has_past_events = (body) => body.includes('past events') && !body.includes('not have any past events'); -const register_page_scraper = (endpoint, page, past_events = false) => { - let responses = []; - return new Promise((resolve, reject) => { - page.on('response', async (response) => { - if (endpoint === response.request().url()) { - let json = {}; - try { - json = await response.json(); - } catch (error) { - return responses; - } - - const getters = { - upcoming: 
get_upcoming_events_from_page, - past: get_past_events_from_page, - }; - - const events = getters[past_events ? 'past' : 'upcoming'](json); - if (events !== null) { - responses = [events, ...responses]; - if (!events.page_info.has_next_page) { - resolve(responses); - } - } - } - }); - }); -}; - -const get_page_events = async ( - browser, - page_id, - get_upcoming_events = true, - get_past_events = false, -) => { - const facebook_page = await browser.newPage(); - - let past_events = []; - let upcoming_events = []; - - let scraping_past_events = false; - let scraping_upcoming_events = false; - - if (get_past_events) { - scraping_past_events = true; - past_events = register_page_scraper(graphql_endpoint, facebook_page, true) - .then((past_events) => { - scraping_past_events = false; - return past_events; - }) - .catch((err) => { - console.error(err); - scraping_past_events = false; - return []; - }); - } else { - past_events = Promise.resolve([]); - } - if (get_upcoming_events) { - scraping_upcoming_events = true; - upcoming_events = register_page_scraper(graphql_endpoint, facebook_page) - .then((upcoming_events) => { - scraping_upcoming_events = false; - return upcoming_events; - }) - .catch((err) => { - console.error(err); - scraping_upcoming_events = false; - return []; - }); - } else { - upcoming_events = Promise.resolve([]); - } - - await facebook_page.goto(page_id); - await facebook_page.waitFor(2000); - const accept_buttons = await facebook_page.$x( - "//button[contains(text(), 'Accept All')]", - ); - if (accept_buttons.length > 0) { - accept_buttons[0].click(); - } - - const body_text = (await get_body_inner_text(facebook_page)).toLowerCase(); - const past_resolved = get_past_events && !has_past_events(body_text); - const upcoming_resolved = - get_upcoming_events && !has_upcoming_events(body_text); - - if (past_resolved) { - past_events = Promise.resolve([]); - scraping_past_events = false; - } - - if (upcoming_resolved) { - upcoming_events = 
Promise.resolve([]); - scraping_upcoming_events = false; - } - - while (scraping_past_events || scraping_upcoming_events) { - await facebook_page.waitFor(1000); - await facebook_page.evaluate(() => window.scrollBy(0, window.innerHeight)); - if (past_resolved && upcoming_resolved) { - break; - } - } - - upcoming_events = await upcoming_events; - past_events = await past_events; - - const responses = [...upcoming_events, ...past_events]; - const nodes = responses.reduce( - (res, current) => [...res, ...current.edges], - [], - ); - - return nodes.map(map_event); -}; - module.exports = { - get_page_events, + has_past_events, + get_body_inner_text, + has_upcoming_events, + get_upcoming_events_from_page, + get_past_events_from_page, + map_event, merge_edges, - open_browser, parse_args, read_previous_events, }; diff --git a/src/scrape.js b/src/scrape.js index 5c2cb33..f483d65 100644 --- a/src/scrape.js +++ b/src/scrape.js @@ -1,12 +1,6 @@ const { pathOr, uniqBy, eqBy, prop, union } = require('ramda'); - -const { - get_page_events, - merge_edges, - open_browser, - parse_args, - read_previous_events, -} = require('./logic'); +const { merge_edges, parse_args, read_previous_events } = require('./logic'); +const { open_browser, get_page_events } = require('./browse'); const { events: event_file,