const puppeteer = require('puppeteer');
const { pathOr, maxBy } = require('ramda');
const path = require('path');
const fs = require('fs').promises;
const filesystem = require('fs');
const { graphql_endpoint } = require('./constants');
const {
  event_url,
  get_city_name,
  get_event_host,
  get_past_events,
  get_upcoming_events,
  merge_edges,
  parse_args,
  save_images,
} = require('./logic');

/**
 * Opens an event page and captures the largest `.jpg` response it loads.
 *
 * @param {object} page - Puppeteer page to navigate (the caller owns and closes it).
 * @param {string} event_id - Identifier handed to `event_url` to build the address.
 * @returns {Promise<object>} `{ image }` holding the largest captured buffer, or
 *   `{}` when no `.jpg` response was captured or the navigation failed.
 */
const load_event = async (page, event_id) => {
  const images = [];

  page.on('response', async (response) => {
    // This listener is async: anything thrown here would surface as an
    // unhandled rejection, so failures are logged instead.
    try {
      const { pathname } = new URL(response.request().url());
      if (path.extname(pathname) === '.jpg') {
        images.push(await response.buffer());
      }
    } catch (err) {
      console.error(err);
    }
  });

  const dom_ready = new Promise((resolve) => {
    page.on('domcontentloaded', resolve);
  });

  try {
    await page.goto(event_url(event_id));
    await dom_ready;
  } catch (err) {
    console.error(err);
    return {};
  }

  // Without this guard, reduce() with no initial value throws on an empty array.
  if (images.length === 0) {
    return {};
  }

  const image = images.reduce((largest, candidate) =>
    maxBy((item) => item.length, largest, candidate),
  );
  return { image };
};

/**
 * Converts a GraphQL edge into the flat event record used by this script.
 *
 * @param {{ node: object }} edge - Edge whose `node` is the raw event.
 * @returns {object} Record with `date`, `name`, `event_id`, `ticket_url` and `location`.
 */
const map_event = ({ node: event }) => {
  const ticket_url = pathOr('', ['event_buy_ticket_url'], event);
  const city = get_city_name(event);
  const host = get_event_host(event);

  return {
    date: event.time_range,
    name: event.name,
    event_id: event.id,
    ticket_url,
    location: {
      host,
      location: city,
    },
  };
};

/**
 * Creates the images directory (and any missing parents).
 * Failures are logged rather than thrown.
 *
 * @param {string} images_directory - Directory path to create.
 * @returns {Promise<*>} Settles once the directory exists or the failure was logged.
 */
const create_images_directory = (images_directory) =>
  fs.mkdir(images_directory, { recursive: true }).catch(console.error);

/**
 * Launches a headless browser.
 *
 * @returns {Promise<object>} The Puppeteer browser instance.
 */
const open_browser = () =>
  puppeteer.launch({
    headless: true,
    args: ['--disable-dev-shm-usage'],
  });

/**
 * Collects the event pages returned by the GraphQL endpoint while `page` is
 * being scrolled.
 *
 * Resolves as soon as a response reports `has_next_page === false` for either
 * the upcoming or the past events. Only upcoming-event pages are accumulated,
 * newest first.
 *
 * @param {string} endpoint - Exact request URL to listen for.
 * @param {object} page - Puppeteer page whose responses are inspected.
 * @returns {Promise<object[]>} Accumulated upcoming-event pages; rejects when a
 *   matching response cannot be processed.
 */
const register_upcoming_events_listener = (endpoint, page) => {
  let responses = [];

  return new Promise((resolve, reject) => {
    page.on('response', async (response) => {
      try {
        if (endpoint !== response.request().url()) {
          return;
        }

        const json = await response.json();

        const upcoming_events = get_upcoming_events(json);
        if (upcoming_events !== null) {
          responses = [upcoming_events, ...responses];
          if (!upcoming_events.page_info.has_next_page) {
            resolve(responses);
          }
        }

        const past_events = get_past_events(json);
        if (past_events !== null && !past_events.page_info.has_next_page) {
          resolve(responses);
        }
      } catch (err) {
        reject(err);
      }
    });
  });
};

const { page_ids, output, events: event_file } = parse_args(
  process.argv.slice(2),
);

/**
 * Reads the previously scraped events from a JSON file.
 *
 * @param {?string} file_path - Path of the events file, or null when none was given.
 * @returns {Promise<object[]>} The stored events; an empty array when the path
 *   is null, the file is missing, or it cannot be read or parsed as a JSON
 *   array (the failure is logged).
 */
const read_previous_events = async (file_path) => {
  if (file_path === null || !filesystem.existsSync(file_path)) {
    return [];
  }

  try {
    const content = await fs.readFile(file_path, { encoding: 'utf-8' });
    const parsed = JSON.parse(content);
    if (!Array.isArray(parsed)) {
      console.error(new TypeError(`${file_path} does not contain a JSON array`));
      return [];
    }
    return parsed;
  } catch (err) {
    // Always hand back an array so callers never see `undefined`.
    console.error(err);
    return [];
  }
};

/**
 * Scrapes the upcoming events listed on one page.
 *
 * @param {object} browser - Puppeteer browser used to open the tab.
 * @param {string} page_id - Address passed to `page.goto`.
 * @returns {Promise<object[]>} Mapped event records; empty when the listener failed.
 */
const scrape_page_events = async (browser, page_id) => {
  const facebook_page = await browser.newPage();

  try {
    let scraping = true;
    const upcoming_events = register_upcoming_events_listener(
      graphql_endpoint,
      facebook_page,
    )
      .catch((err) => {
        console.error(err);
        return [];
      })
      .then((responses) => {
        scraping = false;
        return responses;
      });

    await facebook_page.goto(page_id);

    // NOTE(review): this loop has no timeout; it only ends once the listener
    // settles, so a page that never reports a last page scrolls forever.
    while (scraping) {
      await facebook_page.evaluate(() =>
        window.scrollBy(0, window.innerHeight),
      );
    }

    const responses = await upcoming_events;
    const nodes = [];
    for (const { edges } of responses) {
      nodes.push(...edges);
    }
    return nodes.map(map_event);
  } finally {
    await facebook_page.close().catch(console.error);
  }
};

/**
 * Loads the image for one event in its own tab and closes the tab afterwards.
 *
 * @param {object} browser - Puppeteer browser used to open the tab.
 * @param {object} event - Event record; its own properties win over loaded data.
 * @returns {Promise<object>} The event merged with whatever `load_event` returned.
 */
const load_event_details = async (browser, event) => {
  const event_page = await browser.newPage();

  try {
    const event_data = await load_event(event_page, event.event_id);
    return { ...event_data, ...event };
  } finally {
    await event_page.close().catch(console.error);
  }
};

/**
 * Returns a copy of the event whose `date.start` is a Date instance.
 * Events without a start date are returned unchanged.
 *
 * @param {object} event - Event record.
 * @returns {object} Event with a parsed start date.
 */
const with_parsed_start = (event) => {
  const start = pathOr(null, ['date', 'start'], event);
  if (start === null) {
    return event;
  }
  return { ...event, date: { ...event.date, start: new Date(start) } };
};

/**
 * Numeric sort key for an event's start date.
 *
 * @param {object} event - Event record.
 * @returns {number} Epoch milliseconds, or -Infinity for a missing or invalid
 *   date so those events consistently sort first.
 */
const start_timestamp = (event) => {
  const start = pathOr(null, ['date', 'start'], event);
  if (start === null) {
    return -Infinity;
  }
  const time = new Date(start).getTime();
  return Number.isNaN(time) ? -Infinity : time;
};

/**
 * Orders two events by ascending start date.
 *
 * @param {object} a - First event.
 * @param {object} b - Second event.
 * @returns {number} Negative, zero or positive, as `Array.prototype.sort` expects.
 */
const by_start_date = (a, b) => {
  const a_time = start_timestamp(a);
  const b_time = start_timestamp(b);
  if (a_time === b_time) {
    return 0;
  }
  return a_time > b_time ? 1 : -1;
};

/**
 * Writes a line to stdout and waits until it has been handed to the stream,
 * so the later `process.exit()` cannot truncate piped output.
 *
 * @param {string} text - Text to print; a newline is appended.
 * @returns {Promise<void>} Settles once the write callback has fired.
 */
const write_line = (text) =>
  new Promise((resolve) => {
    process.stdout.write(`${text}\n`, () => resolve());
  });

(async () => {
  let browser = null;

  try {
    // The directory must exist before save_images runs.
    await create_images_directory('./img');

    const previous_events = await read_previous_events(event_file);
    const previous_event_ids = new Set(
      previous_events.map(({ event_id }) => event_id),
    );

    browser = await open_browser();
    let events = [];

    for (const page_id of page_ids) {
      const new_events = await scrape_page_events(browser, page_id);

      events = merge_edges(new_events, events).filter(
        ({ event_id }) => !previous_event_ids.has(event_id),
      );

      // Events that already carry an image keep it (their own properties win
      // in the merge), so only the remaining ones need a tab.
      events = await Promise.all(
        events.map((event) =>
          event.image !== undefined
            ? event
            : load_event_details(browser, event),
        ),
      );
    }

    events = await Promise.all(
      events.map(async (event) => {
        const images = await save_images(event);
        // Drop the raw buffer from the record without mutating the event.
        const { image, ...rest } = event;
        return { images, ...rest };
      }),
    );

    // NOTE(review): nothing is emitted when `output` is not null — confirm
    // whether the result should be written to that destination.
    if (output === null) {
      const all_events = [...events, ...previous_events]
        .map(with_parsed_start)
        .sort(by_start_date);
      await write_line(JSON.stringify(all_events));
    }
  } catch (err) {
    console.error(err);
    process.exitCode = 1;
  } finally {
    if (browser !== null) {
      await browser.close().catch(console.error);
    }
    process.exit();
  }
})();