From dcd77573ae1fabf3f682b1a04d2175dc2f9e2be3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B8rgen=20Sverre=20Lien=20Sell=C3=A6g?= Date: Fri, 29 May 2020 12:42:00 +0200 Subject: [PATCH] scrape all upcoming events following cursor --- scrape.js | 91 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 47 insertions(+), 44 deletions(-) diff --git a/scrape.js b/scrape.js index ec747d1..4164150 100644 --- a/scrape.js +++ b/scrape.js @@ -17,27 +17,6 @@ const get_upcoming_events = pathOr( const merge_edges = unionWith(eqBy(prop('event_id'))); -const load_page = async (page, event_page) => { - try { - const graphql_data = new Promise((resolve, reject) => { - page.on('response', async (response) => { - if (graphql_endpoint === response.request().url()) { - const text = await response.json(); - const upcoming_events = get_upcoming_events(text); - if (upcoming_events !== null) { - resolve(upcoming_events); - } - } - }); - }); - await page.goto(event_page); - await page.evaluate(() => window.scrollBy(0, window.innerHeight)); - return await graphql_data; - } catch (e) { - console.error(e); - } -}; - const load_event = async (page, event_id) => { try { const image_data = new Promise((resolve, reject) => { @@ -174,6 +153,28 @@ const event_ids = pathOr('', ['event_ids'], argv) .filter((str) => str.length !== 0) .map((event_id) => `https://www.facebook.com/events/${event_id}`); +const register_upcoming_events_listener = (endpoint, page) => { + let responses = []; + return new Promise((resolve, reject) => { + page.on('response', async (response) => { + if (endpoint === response.request().url()) { + try { + const json = await response.json(); + const upcoming_events = get_upcoming_events(json); + if (upcoming_events !== null) { + responses = [upcoming_events, ...responses]; + if (!upcoming_events.page_info.has_next_page) { + resolve(responses); + } + } + } catch (err) { + reject(err); + } + } + }); + }); +}; + (async () => { create_images_directory('./events/img'); @@ -182,31 +183,33 @@ const event_ids = pathOr('', ['event_ids'], argv) let events = []; for (let page_id of page_ids) { + let scraping = true; const facebook_page = await browser.newPage(); - const data = await load_page(facebook_page, page_id); - const edges = data.edges.map(map_event); - events = merge_edges(edges, events); - - events = await Promise.all( - events.map(async (event) => { - const event_page = await browser.newPage(); - const event_data = await load_event(event_page, event.event_id); - return { - ...event_data, - ...event, - }; - }), - ); + const upcoming_events = register_upcoming_events_listener( + graphql_endpoint, + facebook_page, + ) + .then((upcoming_events) => { + scraping = false; + return upcoming_events; + }) + .catch((err) => { + console.error(err); + scraping = false; + }); + + await facebook_page.goto(page_id); + + while (scraping) { + await facebook_page.evaluate(() => + window.scrollBy(0, window.innerHeight), + ); + } - events = await Promise.all( - events.map(async (event) => { - const images = await save_images(event); - delete event.image; - return { - images, - ...event, - }; - }), + const responses = await upcoming_events; + const nodes = responses.reduce( + (res, current) => [...res, ...current.edges], + [], ); } console.log(JSON.stringify(events));