From 32c76062197c5af7378a528fd7b678cb250f3504 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B8rgen=20Sverre=20Lien=20Sell=C3=A6g?= Date: Tue, 21 Jul 2020 12:28:34 +0200 Subject: [PATCH] update scraper to distinguish past and upcoming events --- src/logic.js | 133 +++++++++++++++++++++++++++++++++++++------------- src/scrape.js | 2 +- 2 files changed, 100 insertions(+), 35 deletions(-) diff --git a/src/logic.js b/src/logic.js index e334e08..c2df01f 100644 --- a/src/logic.js +++ b/src/logic.js @@ -215,64 +215,129 @@ const map_event = ({ node: event }) => { const open_browser = async () => { const browser = await puppeteer.launch({ - headless: true, + headless: false, args: ['--disable-dev-shm-usage'], }); return browser; }; -const register_page_scraper = (endpoint, page) => { +const has_upcoming_events = async (page) => + await page.evaluate( + 'let txt = document.querySelector("body").innerText;txt.includes("Upcoming events") && !txt.includes("not have any upcoming events")', + ); + +const has_past_events = async (page) => + await page.evaluate( + 'let inner = document.querySelector("body").innerText;inner.includes("Past events") && !inner.includes("not have any past events")', + ); + +const register_page_scraper = (endpoint, page, past_events = false) => { let responses = []; return new Promise((resolve, reject) => { page.on('response', async (response) => { if (endpoint === response.request().url()) { + let json = {}; try { - const json = await response.json(); - const upcoming_events = get_upcoming_events_from_page(json); - if (upcoming_events !== null) { - responses = [upcoming_events, ...responses]; - if (!upcoming_events.page_info.has_next_page) { - resolve(responses); - } - } - const past_events = get_past_events_from_page(json); - if (past_events !== null) { - if ( - !past_events.page_info.has_next_page && - !upcoming_events.page_info.has_next_page - ) { - resolve(responses); - } + json = await response.json(); + } catch (error) { + return responses; + } + 
const getters = { + upcoming: get_upcoming_events_from_page, + past: get_past_events_from_page, + }; + + const events = getters[past_events ? 'past' : 'upcoming'](json); + if (events !== null) { + responses = [events, ...responses]; + if (!events.page_info.has_next_page) { + resolve(responses); } - } catch (err) { - reject(err); } } }); }); }; -const get_page_events = async (browser, page_id, past_events) => { - let scraping = true; +const get_page_events = async ( + browser, + page_id, + get_upcoming_events = true, + get_past_events = false, +) => { const facebook_page = await browser.newPage(); - const upcoming_events = register_page_scraper(graphql_endpoint, facebook_page) - .then((upcoming_events) => { - scraping = false; - return upcoming_events; - }) - .catch((err) => { - console.error(err); - scraping = false; - }); + + let past_events = []; + let upcoming_events = []; + + let scraping_past_events = false; + let scraping_upcoming_events = false; + + if (get_past_events) { + scraping_past_events = true; + + // set turn off timeout + past_events = register_page_scraper(graphql_endpoint, facebook_page, true) + .then((past_events) => { + scraping_past_events = false; + return past_events; + }) + .catch((err) => { + console.error(err); + scraping_past_events = false; + return []; + }); + } else { + past_events = Promise.resolve([]); + } + if (get_upcoming_events) { + scraping_upcoming_events = true; + // set turn off timeout + upcoming_events = register_page_scraper(graphql_endpoint, facebook_page) + .then((upcoming_events) => { + scraping_upcoming_events = false; + return upcoming_events; + }) + .catch((err) => { + console.error(err); + scraping_upcoming_events = false; + return []; + }); + } else { + upcoming_events = Promise.resolve([]); + } await facebook_page.goto(page_id); + await facebook_page.waitFor(2000); + + const past_resolved = + get_past_events && !(await has_past_events(facebook_page)); + const upcoming_resolved = + get_upcoming_events && !(await 
has_upcoming_events(facebook_page)); + + if (past_resolved) { + past_events = Promise.resolve([]); + scraping_past_events = false; + } + + if (upcoming_resolved) { + upcoming_events = Promise.resolve([]); + scraping_upcoming_events = false; + } - while (scraping) { - await facebook_page.waitFor(2000); + while (scraping_past_events || scraping_upcoming_events) { + await facebook_page.waitFor(1000); await facebook_page.evaluate(() => window.scrollBy(0, window.innerHeight)); + if (past_resolved && upcoming_resolved) { + break; + } } - const responses = await upcoming_events; + upcoming_events = await upcoming_events; + past_events = await past_events; + + const responses = [...upcoming_events, ...past_events]; const nodes = responses.reduce( (res, current) => [...res, ...current.edges], [], diff --git a/src/scrape.js b/src/scrape.js index 2b31632..4117517 100644 --- a/src/scrape.js +++ b/src/scrape.js @@ -57,7 +57,7 @@ const { page_ids, output, events: event_file } = parse_args( * }), * ); */ - const all_events = [...events, ...previous_events] + let all_events = merge_edges(events, previous_events) .map((event) => { const start = pathOr(null, ['date', 'start'], event); if (start !== null) {