Browse Source

fix scraping errors

fix-broken-scrape
Jørgen Lien Sellæg 6 years ago
parent
commit
6c0613ebab
  1. 9
      src/logic.js
  2. 50
      src/scrape.js

9
src/logic.js

@ -237,7 +237,10 @@ const register_page_scraper = (endpoint, page) => {
} }
const past_events = get_past_events_from_page(json); const past_events = get_past_events_from_page(json);
if (past_events !== null) { if (past_events !== null) {
if (!past_events.page_info.has_next_page) { if (
!past_events.page_info.has_next_page &&
!upcoming_events.page_info.has_next_page
) {
resolve(responses); resolve(responses);
} }
} }
@ -249,7 +252,7 @@ const register_page_scraper = (endpoint, page) => {
}); });
}; };
const get_page_events = async (browser, page_id) => { const get_page_events = async (browser, page_id, past_events) => {
let scraping = true; let scraping = true;
const facebook_page = await browser.newPage(); const facebook_page = await browser.newPage();
const upcoming_events = register_page_scraper(graphql_endpoint, facebook_page) const upcoming_events = register_page_scraper(graphql_endpoint, facebook_page)
@ -265,6 +268,7 @@ const get_page_events = async (browser, page_id) => {
await facebook_page.goto(page_id); await facebook_page.goto(page_id);
while (scraping) { while (scraping) {
await facebook_page.waitFor(2000);
await facebook_page.evaluate(() => window.scrollBy(0, window.innerHeight)); await facebook_page.evaluate(() => window.scrollBy(0, window.innerHeight));
} }
@ -283,4 +287,5 @@ module.exports = {
parse_args, parse_args,
read_previous_events, read_previous_events,
get_page_events, get_page_events,
merge_edges,
}; };

50
src/scrape.js

@ -6,6 +6,7 @@ const {
open_browser, open_browser,
parse_args, parse_args,
read_previous_events, read_previous_events,
merge_edges,
} = require('./logic'); } = require('./logic');
const { page_ids, output, events: event_file } = parse_args( const { page_ids, output, events: event_file } = parse_args(
@ -22,7 +23,7 @@ const { page_ids, output, events: event_file } = parse_args(
for (let page_id of page_ids) { for (let page_id of page_ids) {
const past_events = false; const past_events = false;
const new_events = get_page_events(browser, page_id, past_events); const new_events = await get_page_events(browser, page_id, past_events);
events = merge_edges(new_events, events); events = merge_edges(new_events, events);
events = events.filter( events = events.filter(
@ -32,31 +33,30 @@ const { page_ids, output, events: event_file } = parse_args(
) === undefined, ) === undefined,
); );
events = await Promise.all( /* events = await Promise.all(
events.map(async (event) => { * events.map(async (event) => {
const event_page = await browser.newPage(); * const event_page = await browser.newPage();
const event_data = await load_event(event_page, event.event_id); * const event_data = await load_event(event_page, event.event_id);
event_page.close(); * event_page.close();
return { * return {
...event_data, * ...event_data,
...event, * ...event,
}; * };
}), * }),
); * ); */
} }
events = await Promise.all( /* events = await Promise.all(
events.map(async (event) => { * events.map(async (event) => {
const images = await save_images(event); * const images = await save_images(event);
delete event.image; * delete event.image;
return { * return {
images, * images,
...event, * ...event,
}; * };
}), * }),
); * ); */
if (output === null) {
const all_events = [...events, ...previous_events] const all_events = [...events, ...previous_events]
.map((event) => { .map((event) => {
const start = pathOr(null, ['date', 'start'], event); const start = pathOr(null, ['date', 'start'], event);
@ -82,8 +82,8 @@ const { page_ids, output, events: event_file } = parse_args(
return -1; return -1;
}); });
if (output === null) {
console.log(JSON.stringify(all_events)); console.log(JSON.stringify(all_events));
}
process.exit(); process.exit();
}
})(); })();

Loading…
Cancel
Save