
move upcoming to logic

fix-broken-scrape
Jørgen Lien Sellæg, 6 years ago
parent commit ce85df9fb2
Changed files:
  1. src/logic.js  (57 lines changed)
  2. src/scrape.js (36 lines changed)

src/logic.js
@@ -1,14 +1,25 @@
-const { pathOr, hasPath, props, prop, unionWith, eqBy } = require('ramda');
+const {
+  eqBy,
+  hasPath,
+  maxBy,
+  pathOr,
+  prop,
+  props,
+  unionWith,
+} = require('ramda');
 const parseArgs = require('minimist');
 const process = require('process');
 const event_url = (event_id) => `https://www.facebook.com/events/${event_id}`;
 const page_url = (page_id) => `https://www.facebook.com/${page_id}`;
 const page_events_url = (page_id) => page_url(page_id) + '/events/';
+const { graphql_endpoint } = require('./constants');
 const fs = require('fs').promises;
 const filesystem = require('fs');
+const path = require('path');
 const gm = require('gm').subClass({ imageMagick: true });
 const puppeteer = require('puppeteer');
@@ -51,12 +62,16 @@ const parse_args = (args) => {
   };
 };
-const get_upcoming_events = pathOr(
+const get_upcoming_events_from_page = pathOr(
   null,
   'data.page.upcoming_events'.split('.'),
 );
-const get_past_events = pathOr(null, 'data.page.past_events'.split('.'));
+const get_past_events_from_page = pathOr(
+  null,
+  'data.page.past_events'.split('.'),
+);
 const merge_edges = unionWith(eqBy(prop('event_id')));
 const write_image = (path, image) =>
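For reference, the two renamed getters are plain ramda pathOr accessors over the GraphQL payload. A rough sketch of the shape they assume, using only the paths and fields that actually appear in this diff (everything else about the payload is left out):

// Illustrative payload: only data.page.upcoming_events / past_events,
// page_info.has_next_page and edges are taken from the code in this diff.
const payload = {
  data: {
    page: {
      upcoming_events: { edges: [], page_info: { has_next_page: false } },
      past_events: { edges: [], page_info: { has_next_page: false } },
    },
  },
};

get_upcoming_events_from_page(payload); // -> the upcoming_events connection
get_past_events_from_page(payload);     // -> the past_events connection
get_upcoming_events_from_page({});      // -> null (the pathOr default)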
@@ -213,14 +228,14 @@ const register_upcoming_events_listener = (endpoint, page) => {
       if (endpoint === response.request().url()) {
         try {
           const json = await response.json();
-          const upcoming_events = get_upcoming_events(json);
+          const upcoming_events = get_upcoming_events_from_page(json);
           if (upcoming_events !== null) {
             responses = [upcoming_events, ...responses];
             if (!upcoming_events.page_info.has_next_page) {
               resolve(responses);
             }
           }
-          const past_events = get_past_events(json);
+          const past_events = get_past_events_from_page(json);
           if (past_events !== null) {
             if (!past_events.page_info.has_next_page) {
               resolve(responses);
@@ -234,9 +249,41 @@ const register_upcoming_events_listener = (endpoint, page) => {
   });
 };
 
+const get_upcoming_events = async (browser, page_id) => {
+  let scraping = true;
+  const facebook_page = await browser.newPage();
+  const upcoming_events = register_upcoming_events_listener(
+    graphql_endpoint,
+    facebook_page,
+  )
+    .then((upcoming_events) => {
+      scraping = false;
+      return upcoming_events;
+    })
+    .catch((err) => {
+      console.error(err);
+      scraping = false;
+    });
+  await facebook_page.goto(page_id);
+  while (scraping) {
+    await facebook_page.evaluate(() => window.scrollBy(0, window.innerHeight));
+  }
+  const responses = await upcoming_events;
+  const nodes = responses.reduce(
+    (res, current) => [...res, ...current.edges],
+    [],
+  );
+  return nodes.map(map_event);
+};
+
 module.exports = {
   create_images_directory,
   open_browser,
   parse_args,
   read_previous_events,
+  get_upcoming_events,
 };
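Taken together, the new get_upcoming_events helper opens a tab, registers the GraphQL response listener, scrolls the page until pagination reports no next page, and maps the collected edges through map_event. A minimal sketch of how a caller could use the new export; the URL, the argument-less open_browser call, and the assumption that it resolves to a puppeteer Browser are illustrative, not part of this commit:

const { open_browser, get_upcoming_events } = require('./logic');

(async () => {
  // Assumes open_browser takes no arguments and resolves to a puppeteer Browser.
  const browser = await open_browser();
  // page_id is passed straight to page.goto() inside the helper,
  // so a full events URL is used here (illustrative value).
  const events = await get_upcoming_events(
    browser,
    'https://www.facebook.com/examplepage/events/',
  );
  console.log(`scraped ${events.length} upcoming events`);
  await browser.close();
})();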

src/scrape.js
@@ -1,10 +1,8 @@
-const { pathOr, maxBy } = require('ramda');
+const { pathOr } = require('ramda');
-const path = require('path');
-const fs = require('fs').promises;
-const { graphql_endpoint } = require('./constants');
 const {
   create_images_directory,
+  get_upcoming_events,
   open_browser,
   parse_args,
   read_previous_events,
@@ -24,35 +22,7 @@ const { page_ids, output, events: event_file } = parse_args(
   let events = [];
   for (let page_id of page_ids) {
-    let scraping = true;
-    const facebook_page = await browser.newPage();
-    const upcoming_events = register_upcoming_events_listener(
-      graphql_endpoint,
-      facebook_page,
-    )
-      .then((upcoming_events) => {
-        scraping = false;
-        return upcoming_events;
-      })
-      .catch((err) => {
-        console.error(err);
-        scraping = false;
-      });
-    await facebook_page.goto(page_id);
-    while (scraping) {
-      await facebook_page.evaluate(() =>
-        window.scrollBy(0, window.innerHeight),
-      );
-    }
-    const responses = await upcoming_events;
-    const nodes = responses.reduce(
-      (res, current) => [...res, ...current.edges],
-      [],
-    );
-    const new_events = nodes.map(map_event);
+    const new_events = get_upcoming_events(browser, page_id);
     events = merge_edges(new_events, events);
     events = events.filter(
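Since get_upcoming_events is declared async in logic.js, the value it yields at this call site is a promise. A sketch of the loop body with that promise awaited before the merge, for illustration only and not part of the diff as shown:

let events = [];
for (let page_id of page_ids) {
  // get_upcoming_events resolves to an array of mapped events;
  // merge_edges deduplicates by event_id, so it needs the resolved array.
  const new_events = await get_upcoming_events(browser, page_id);
  events = merge_edges(new_events, events);
}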
