Browse Source

fix scraping errors

fix-broken-scrape
Jørgen Lien Sellæg 6 years ago
parent
commit
6c0613ebab
  1. 9
      src/logic.js
  2. 96
      src/scrape.js

9
src/logic.js

@@ -237,7 +237,10 @@ const register_page_scraper = (endpoint, page) => {
} }
const past_events = get_past_events_from_page(json); const past_events = get_past_events_from_page(json);
if (past_events !== null) { if (past_events !== null) {
if (!past_events.page_info.has_next_page) { if (
!past_events.page_info.has_next_page &&
!upcoming_events.page_info.has_next_page
) {
resolve(responses); resolve(responses);
} }
} }
@@ -249,7 +252,7 @@ const register_page_scraper = (endpoint, page) => {
}); });
}; };
const get_page_events = async (browser, page_id) => { const get_page_events = async (browser, page_id, past_events) => {
let scraping = true; let scraping = true;
const facebook_page = await browser.newPage(); const facebook_page = await browser.newPage();
const upcoming_events = register_page_scraper(graphql_endpoint, facebook_page) const upcoming_events = register_page_scraper(graphql_endpoint, facebook_page)
@@ -265,6 +268,7 @@ const get_page_events = async (browser, page_id) => {
await facebook_page.goto(page_id); await facebook_page.goto(page_id);
while (scraping) { while (scraping) {
await facebook_page.waitFor(2000);
await facebook_page.evaluate(() => window.scrollBy(0, window.innerHeight)); await facebook_page.evaluate(() => window.scrollBy(0, window.innerHeight));
} }
@@ -283,4 +287,5 @@ module.exports = {
parse_args, parse_args,
read_previous_events, read_previous_events,
get_page_events, get_page_events,
merge_edges,
}; };

96
src/scrape.js

@@ -6,6 +6,7 @@ const {
open_browser, open_browser,
parse_args, parse_args,
read_previous_events, read_previous_events,
merge_edges,
} = require('./logic'); } = require('./logic');
const { page_ids, output, events: event_file } = parse_args( const { page_ids, output, events: event_file } = parse_args(
@@ -22,7 +23,7 @@ const { page_ids, output, events: event_file } = parse_args(
for (let page_id of page_ids) { for (let page_id of page_ids) {
const past_events = false; const past_events = false;
const new_events = get_page_events(browser, page_id, past_events); const new_events = await get_page_events(browser, page_id, past_events);
events = merge_edges(new_events, events); events = merge_edges(new_events, events);
events = events.filter( events = events.filter(
@@ -32,58 +33,57 @@ const { page_ids, output, events: event_file } = parse_args(
) === undefined, ) === undefined,
); );
events = await Promise.all( /* events = await Promise.all(
events.map(async (event) => { * events.map(async (event) => {
const event_page = await browser.newPage(); * const event_page = await browser.newPage();
const event_data = await load_event(event_page, event.event_id); * const event_data = await load_event(event_page, event.event_id);
event_page.close(); * event_page.close();
return { * return {
...event_data, * ...event_data,
...event, * ...event,
}; * };
}), * }),
); * ); */
} }
events = await Promise.all( /* events = await Promise.all(
events.map(async (event) => { * events.map(async (event) => {
const images = await save_images(event); * const images = await save_images(event);
delete event.image; * delete event.image;
return { * return {
images, * images,
...event, * ...event,
}; * };
}), * }),
); * ); */
if (output === null) { const all_events = [...events, ...previous_events]
const all_events = [...events, ...previous_events] .map((event) => {
.map((event) => { const start = pathOr(null, ['date', 'start'], event);
const start = pathOr(null, ['date', 'start'], event); if (start !== null) {
if (start !== null) { try {
try { event.date.start = new Date(start);
event.date.start = new Date(start); } catch (e) {
} catch (e) { console.error(e);
console.error(e); return event;
return event;
}
}
return event;
})
.sort((a, b) => {
const b_date = b.date.start;
const a_date = a.date.start;
if (a_date > b_date) {
return 1;
} }
if (a_date == b_date) { }
return 0; return event;
} })
return -1; .sort((a, b) => {
}); const b_date = b.date.start;
const a_date = a.date.start;
if (a_date > b_date) {
return 1;
}
if (a_date == b_date) {
return 0;
}
return -1;
});
if (output === null) {
console.log(JSON.stringify(all_events)); console.log(JSON.stringify(all_events));
process.exit();
} }
process.exit();
})(); })();

Loading…
Cancel
Save