Browse Source

Sort of crash-free now

fix-broken-scrape
Jørgen Lien Sellæg 6 years ago
parent
commit
df59427fbc
  1. 17
      src/browse.js
  2. 27
      src/scrape.js

17
src/browse.js

@ -47,12 +47,8 @@ const register_page_scraper = (endpoint, page, past_events = false) => {
}); });
}; };
const get_page_events = async ( const get_page_events = async (page_id, opt) => {
browser, const browser = await open_browser(opt);
page_id,
get_upcoming_events = true,
get_past_events = false,
) => {
const facebook_page = await browser.newPage(); const facebook_page = await browser.newPage();
let past_events = []; let past_events = [];
@ -61,7 +57,7 @@ const get_page_events = async (
let scraping_past_events = false; let scraping_past_events = false;
let scraping_upcoming_events = false; let scraping_upcoming_events = false;
if (get_past_events) { if (opt.get_past_events) {
scraping_past_events = true; scraping_past_events = true;
past_events = register_page_scraper(graphql_endpoint, facebook_page, true) past_events = register_page_scraper(graphql_endpoint, facebook_page, true)
.then((past_events) => { .then((past_events) => {
@ -76,7 +72,7 @@ const get_page_events = async (
} else { } else {
past_events = Promise.resolve([]); past_events = Promise.resolve([]);
} }
if (get_upcoming_events) { if (opt.get_upcoming_events) {
scraping_upcoming_events = true; scraping_upcoming_events = true;
upcoming_events = register_page_scraper(graphql_endpoint, facebook_page) upcoming_events = register_page_scraper(graphql_endpoint, facebook_page)
.then((upcoming_events) => { .then((upcoming_events) => {
@ -102,9 +98,9 @@ const get_page_events = async (
} }
const body_text = (await get_body_inner_text(facebook_page)).toLowerCase(); const body_text = (await get_body_inner_text(facebook_page)).toLowerCase();
const past_resolved = get_past_events && !has_past_events(body_text); const past_resolved = opt.get_past_events && !has_past_events(body_text);
const upcoming_resolved = const upcoming_resolved =
get_upcoming_events && !has_upcoming_events(body_text); opt.get_upcoming_events && !has_upcoming_events(body_text);
if (past_resolved) { if (past_resolved) {
past_events = Promise.resolve([]); past_events = Promise.resolve([]);
@ -138,5 +134,4 @@ const get_page_events = async (
module.exports = { module.exports = {
get_page_events, get_page_events,
open_browser,
}; };

27
src/scrape.js

@ -6,40 +6,27 @@ const {
to_unique_events, to_unique_events,
} = require('./logic'); } = require('./logic');
const { open_browser, get_page_events } = require('./browser'); const { get_page_events } = require('./browse');
const { const options = parse_args(process.argv.slice(2));
events: event_file,
get_past_events,
get_upcoming_events,
output,
page_id,
headless,
} = parse_args(process.argv.slice(2));
(async () => { (async () => {
let events = []; let events = [];
let prev_events = [];
try { try {
const browser = await open_browser({ headless }); events = await get_page_events(options);
events = await get_page_events( prev_events = await read_previous_events(options.events);
browser,
page_id,
get_upcoming_events,
get_past_events,
);
} catch (e) { } catch (e) {
console.error(e); console.error(e);
} }
const previous_events = await read_previous_events(event_file);
let all_events = events let all_events = events
.reduce(to_unique_events, previous_events) .reduce(to_unique_events, prev_events)
.map(event_date_to_date_obj) .map(event_date_to_date_obj)
.sort(by_date); .sort(by_date);
if (output === null) { if (options.output === null) {
console.log(JSON.stringify(all_events)); console.log(JSON.stringify(all_events));
process.exit(); process.exit();
} }

Loading…
Cancel
Save