From 6c0613ebab404ec7c0042185471758b23a6d9976 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B8rgen=20Sverre=20Lien=20Sell=C3=A6g?= Date: Fri, 17 Jul 2020 19:04:21 +0200 Subject: [PATCH] fix scraping errors --- src/logic.js | 9 +++-- src/scrape.js | 96 +++++++++++++++++++++++++-------------------------- 2 files changed, 55 insertions(+), 50 deletions(-) diff --git a/src/logic.js b/src/logic.js index 2d4a5c7..e334e08 100644 --- a/src/logic.js +++ b/src/logic.js @@ -237,7 +237,10 @@ const register_page_scraper = (endpoint, page) => { } const past_events = get_past_events_from_page(json); if (past_events !== null) { - if (!past_events.page_info.has_next_page) { + if ( + !past_events.page_info.has_next_page && + !upcoming_events.page_info.has_next_page + ) { resolve(responses); } } @@ -249,7 +252,7 @@ const register_page_scraper = (endpoint, page) => { }); }; -const get_page_events = async (browser, page_id) => { +const get_page_events = async (browser, page_id, past_events) => { let scraping = true; const facebook_page = await browser.newPage(); const upcoming_events = register_page_scraper(graphql_endpoint, facebook_page) @@ -265,6 +268,7 @@ const get_page_events = async (browser, page_id) => { await facebook_page.goto(page_id); while (scraping) { + await facebook_page.waitFor(2000); await facebook_page.evaluate(() => window.scrollBy(0, window.innerHeight)); } @@ -283,4 +287,5 @@ module.exports = { parse_args, read_previous_events, get_page_events, + merge_edges, }; diff --git a/src/scrape.js b/src/scrape.js index 1f776b5..2b31632 100644 --- a/src/scrape.js +++ b/src/scrape.js @@ -6,6 +6,7 @@ const { open_browser, parse_args, read_previous_events, + merge_edges, } = require('./logic'); const { page_ids, output, events: event_file } = parse_args( @@ -22,7 +23,7 @@ const { page_ids, output, events: event_file } = parse_args( for (let page_id of page_ids) { const past_events = false; - const new_events = get_page_events(browser, page_id, past_events); + const new_events = await get_page_events(browser, page_id, past_events); events = merge_edges(new_events, events); events = events.filter( @@ -32,58 +33,57 @@ const { page_ids, output, events: event_file } = parse_args( ) === undefined, ); - events = await Promise.all( - events.map(async (event) => { - const event_page = await browser.newPage(); - const event_data = await load_event(event_page, event.event_id); - event_page.close(); - return { - ...event_data, - ...event, - }; - }), - ); + /* events = await Promise.all( + * events.map(async (event) => { + * const event_page = await browser.newPage(); + * const event_data = await load_event(event_page, event.event_id); + * event_page.close(); + * return { + * ...event_data, + * ...event, + * }; + * }), + * ); */ } - events = await Promise.all( - events.map(async (event) => { - const images = await save_images(event); - delete event.image; - return { - images, - ...event, - }; - }), - ); + /* events = await Promise.all( + * events.map(async (event) => { + * const images = await save_images(event); + * delete event.image; + * return { + * images, + * ...event, + * }; + * }), + * ); */ - if (output === null) { - const all_events = [...events, ...previous_events] - .map((event) => { - const start = pathOr(null, ['date', 'start'], event); - if (start !== null) { - try { - event.date.start = new Date(start); - } catch (e) { - console.error(e); - return event; - } - } - return event; - }) - .sort((a, b) => { - const b_date = b.date.start; - const a_date = a.date.start; - if (a_date > b_date) { - return 1; + const all_events = [...events, ...previous_events] + .map((event) => { + const start = pathOr(null, ['date', 'start'], event); + if (start !== null) { + try { + event.date.start = new Date(start); + } catch (e) { + console.error(e); + return event; } - if (a_date == b_date) { - return 0; - } - return -1; - }); + } + return event; + }) + .sort((a, b) => { + const b_date = b.date.start; + const a_date = a.date.start; + if (a_date > b_date) { + return 1; + } + if (a_date == b_date) { + return 0; + } + return -1; + }); + if (output === null) { console.log(JSON.stringify(all_events)); + process.exit(); } - - process.exit(); })();