Browse Source

fix scraping errors

fix-broken-scrape
Jørgen Lien Sellæg 6 years ago
parent
commit
6c0613ebab
  1. 9
      src/logic.js
  2. 96
      src/scrape.js

9
src/logic.js

@@ -237,7 +237,10 @@ const register_page_scraper = (endpoint, page) => {
}
const past_events = get_past_events_from_page(json);
if (past_events !== null) {
if (!past_events.page_info.has_next_page) {
if (
!past_events.page_info.has_next_page &&
!upcoming_events.page_info.has_next_page
) {
resolve(responses);
}
}
@@ -249,7 +252,7 @@ const register_page_scraper = (endpoint, page) => {
});
};
const get_page_events = async (browser, page_id) => {
const get_page_events = async (browser, page_id, past_events) => {
let scraping = true;
const facebook_page = await browser.newPage();
const upcoming_events = register_page_scraper(graphql_endpoint, facebook_page)
@@ -265,6 +268,7 @@ const get_page_events = async (browser, page_id) => {
await facebook_page.goto(page_id);
while (scraping) {
await facebook_page.waitFor(2000);
await facebook_page.evaluate(() => window.scrollBy(0, window.innerHeight));
}
@@ -283,4 +287,5 @@ module.exports = {
parse_args,
read_previous_events,
get_page_events,
merge_edges,
};

96
src/scrape.js

@@ -6,6 +6,7 @@ const {
open_browser,
parse_args,
read_previous_events,
merge_edges,
} = require('./logic');
const { page_ids, output, events: event_file } = parse_args(
@@ -22,7 +23,7 @@ const { page_ids, output, events: event_file } = parse_args(
for (let page_id of page_ids) {
const past_events = false;
const new_events = get_page_events(browser, page_id, past_events);
const new_events = await get_page_events(browser, page_id, past_events);
events = merge_edges(new_events, events);
events = events.filter(
@@ -32,58 +33,57 @@ const { page_ids, output, events: event_file } = parse_args(
) === undefined,
);
events = await Promise.all(
events.map(async (event) => {
const event_page = await browser.newPage();
const event_data = await load_event(event_page, event.event_id);
event_page.close();
return {
...event_data,
...event,
};
}),
);
/* events = await Promise.all(
* events.map(async (event) => {
* const event_page = await browser.newPage();
* const event_data = await load_event(event_page, event.event_id);
* event_page.close();
* return {
* ...event_data,
* ...event,
* };
* }),
* ); */
}
events = await Promise.all(
events.map(async (event) => {
const images = await save_images(event);
delete event.image;
return {
images,
...event,
};
}),
);
/* events = await Promise.all(
* events.map(async (event) => {
* const images = await save_images(event);
* delete event.image;
* return {
* images,
* ...event,
* };
* }),
* ); */
if (output === null) {
const all_events = [...events, ...previous_events]
.map((event) => {
const start = pathOr(null, ['date', 'start'], event);
if (start !== null) {
try {
event.date.start = new Date(start);
} catch (e) {
console.error(e);
return event;
}
}
return event;
})
.sort((a, b) => {
const b_date = b.date.start;
const a_date = a.date.start;
if (a_date > b_date) {
return 1;
const all_events = [...events, ...previous_events]
.map((event) => {
const start = pathOr(null, ['date', 'start'], event);
if (start !== null) {
try {
event.date.start = new Date(start);
} catch (e) {
console.error(e);
return event;
}
if (a_date == b_date) {
return 0;
}
return -1;
});
}
return event;
})
.sort((a, b) => {
const b_date = b.date.start;
const a_date = a.date.start;
if (a_date > b_date) {
return 1;
}
if (a_date == b_date) {
return 0;
}
return -1;
});
if (output === null) {
console.log(JSON.stringify(all_events));
process.exit();
}
process.exit();
})();

Loading…
Cancel
Save