Browse Source

change: now updates events already scraped if passed in with the --event option

fix-broken-scrape
Jørgen Lien Sellæg 6 years ago
parent
commit
a8827b999c
  1. 8
      src/logic.js
  2. 36
      src/scrape.js

8
src/logic.js

@ -105,7 +105,12 @@ const get_past_events_from_page = pathOr(
'data.page.past_events'.split('.'), 'data.page.past_events'.split('.'),
); );
const merge_edges = unionWith(eqBy(prop('event_id'))); const merge_edges = (acc, current) => {
return [
...acc.filter((event) => event.event_id !== current.event_id),
current,
];
};
const write_image = (path, image) => const write_image = (path, image) =>
fs.writeFile(path, image, { encoding: null }); fs.writeFile(path, image, { encoding: null });
@ -227,6 +232,7 @@ const load_event = async (page, event_id) => {
return { image }; return { image };
} catch (e) { } catch (e) {
console.error(e); console.error(e);
return {};
} }
}; };

36
src/scrape.js

@ -1,4 +1,4 @@
const { pathOr } = require('ramda'); const { pathOr, uniqBy, eqBy, prop, union } = require('ramda');
const { const {
create_images_directory, create_images_directory,
@ -27,51 +27,45 @@ const {
create_images_directory(image_directory); create_images_directory(image_directory);
} }
const previous_events = await read_previous_events(event_file);
const browser = await open_browser({ headless });
let events = []; let events = [];
const browser = await open_browser({ headless });
for (let page_id of page_ids) { for (let page_id of page_ids) {
let page_events = [];
try { try {
const new_events = await get_page_events( page_events = await get_page_events(
browser, browser,
page_id, page_id,
get_upcoming_events, get_upcoming_events,
get_past_events, get_past_events,
); );
events = merge_edges(new_events, events);
} catch (e) { } catch (e) {
console.error(e); console.error(e);
events = previous_events;
} }
events = events.filter(
({ event_id }) =>
previous_events.find(
(previous_event) => event_id === previous_event.event_id,
) === undefined,
);
if (images) { if (images) {
events = await Promise.all( page_events = await Promise.all(
events.map(async (event) => { page_events.map(async (event) => {
const event_page = await browser.newPage(); const event_page = await browser.newPage();
const { image } = await load_event(event_page, event.event_id); const { image } = await load_event(event_page, event.event_id);
event_page.close(); await event_page.close();
const images = await save_images( const images = await save_images(
image, image,
event.event_id, event.event_id,
image_directory, image_directory,
); );
return { return { images, ...event };
images,
...event,
};
}), }),
); );
} }
events = uniqBy(eqBy(prop('event_id')))(union(events, page_events));
} }
let all_events = merge_edges(events, previous_events) const previous_events = await read_previous_events(event_file);
let all_events = events
.reduce(merge_edges, previous_events)
.map((event) => { .map((event) => {
const start = pathOr(null, ['date', 'start'], event); const start = pathOr(null, ['date', 'start'], event);
if (start !== null) { if (start !== null) {

Loading…
Cancel
Save