const puppeteer = require('puppeteer');
const { pathOr, unionWith, prop, eqBy, maxBy } = require('ramda');
const url = require('url');
const path = require('path');
const fs = require('fs').promises;
const filesystem = require('fs');
const gm = require('gm').subClass({ imageMagick: true });
const { graphql_endpoint } = require('./constants');
const { event_url, parse_args } = require('./logic');

/** Reads `data.page.upcoming_events` from a response payload, or null. */
const get_upcoming_events = pathOr(
  null,
  'data.page.upcoming_events'.split('.'),
);

/** Reads `data.page.past_events` from a response payload, or null. */
const get_past_events = pathOr(null, 'data.page.past_events'.split('.'));

/** Concatenates two event lists, keeping the first entry per `event_id`. */
const merge_edges = unionWith(eqBy(prop('event_id')));

/**
 * Navigates `page` to the event's URL and returns the largest `.jpg`
 * response body seen during the navigation.
 *
 * @param {object} page - Puppeteer page to navigate. The caller owns it and
 *   is responsible for closing it.
 * @param {*} event_id - Identifier passed to `event_url`.
 * @returns {Promise<{image?: Buffer}>} `{ image }` when at least one JPEG was
 *   captured, otherwise `{}` (also on navigation failure, which is logged).
 */
const load_event = async (page, event_id) => {
  // Collect the buffer *promises* rather than the buffers so that bodies
  // still being read when navigation finishes are not lost.
  const pending_buffers = [];
  page.on('response', (response) => {
    const { pathname } = new URL(response.request().url());
    if (path.extname(pathname) !== '.jpg') {
      return;
    }
    pending_buffers.push(
      // A single unreadable body must not abort the whole event
      // (or surface as an unhandled rejection); log it and skip it.
      response.buffer().catch((err) => {
        console.error(err);
        return null;
      }),
    );
  });

  try {
    await page.goto(event_url(event_id));
    const buffers = await Promise.all(pending_buffers);
    const images = buffers.filter((buffer) => buffer !== null);
    if (images.length === 0) {
      return {};
    }
    // maxBy keeps the first argument on ties, so the earliest largest wins.
    const image = images.reduce((largest, candidate) =>
      maxBy((item) => item.length, largest, candidate),
    );
    return { image };
  } catch (e) {
    console.error(e);
    return {};
  }
};

/** Writes the raw image buffer to `file_path`. */
const write_image = (file_path, image) =>
  fs.writeFile(file_path, image, { encoding: null });

/** Promise wrapper around gm's callback-style `write`. */
const gm_write = (image, file_path) =>
  new Promise((resolve, reject) => {
    image.write(file_path, (err) => (err ? reject(err) : resolve()));
  });

/**
 * Writes a centre-cropped square version of `original` to `image_path`.
 *
 * @param {string} image_path - Destination file.
 * @param {Buffer} original - Source image data.
 * @returns {Promise<void>}
 * @throws {Error} 'Could not get image.' when gm cannot read the dimensions.
 */
const write_resized = async (image_path, original) => {
  const image = gm(original);
  const size = await new Promise((resolve) => {
    image.size((err, value) => resolve(err ? null : value));
  });
  if (size === null) {
    throw new Error('Could not get image.');
  }

  // Round both sides up to an even number so the crop offsets below are
  // whole pixels.
  const to_even = (n) => (n % 2 === 1 ? n + 1 : n);
  const x = to_even(size.width);
  const y = to_even(size.height);
  image.resize(x, y);

  if (y > x) {
    // Portrait: trim equal bands from top and bottom.
    image.crop(x, x, 0, (y - x) / 2);
  } else if (y < x) {
    // Landscape: trim equal bands from left and right.
    image.crop(y, y, (x - y) / 2, 0);
  }
  return gm_write(image, image_path);
};

/**
 * Stores the original image and its square variant under `./img`.
 *
 * @param {{image?: Buffer|null, event_id: *}} event
 * @returns {Promise<Array|{original: string|null, square?: string}>}
 *   `[]` when the event has no image, both paths on success, or
 *   `{ original: null }` when either write fails (the error is logged).
 */
const save_images = async ({ image = null, event_id }) => {
  if (image === null) {
    return [];
  }
  const original_path = `./img/${event_id}.jpg`;
  const resized_path = `./img/${event_id}-square.jpg`;
  try {
    await Promise.all([
      write_image(original_path, image),
      write_resized(resized_path, image),
    ]);
    return { original: original_path, square: resized_path };
  } catch (err) {
    console.error(err);
    return { original: null };
  }
};

/** Reads `event_place.city.contextual_name`, defaulting to ''. */
const get_city_name = (event) =>
  pathOr('', 'event_place.city.contextual_name'.split('.'), event);

/** Reads `event_place.contextual_name`, defaulting to ''. */
const get_event_host = (event) =>
  pathOr('', 'event_place.contextual_name'.split('.'), event);

/** Unwraps the `node` of an edge. */
const edge_to_node = (edge) => edge.node;

/**
 * Converts an edge (`{ node }`) into the flat event record used by this
 * script.
 */
const map_event = ({ node: event }) => ({
  date: event.time_range,
  name: event.name,
  event_id: event.id,
  ticket_url: pathOr('', ['event_buy_ticket_url'], event),
  location: {
    host: get_event_host(event),
    location: get_city_name(event),
  },
});

/** Creates the images directory (and parents); failures are only logged. */
const create_images_directory = (images_directory) =>
  fs.mkdir(images_directory, { recursive: true }).catch(console.error);

/**
 * Launches a headless browser.
 * `images_directory` is accepted for compatibility but is not used.
 */
const open_browser = async (images_directory) =>
  puppeteer.launch({
    headless: true,
    args: ['--disable-dev-shm-usage'],
  });

/**
 * Listens for responses whose request URL equals `endpoint` and accumulates
 * their `upcoming_events` payloads (most recent first).
 *
 * @param {string} endpoint - Exact request URL to match.
 * @param {object} page - Puppeteer page to observe.
 * @returns {Promise<Array>} Resolves with the collected payloads once an
 *   upcoming-events or past-events payload reports no next page; rejects if
 *   a matching response cannot be parsed as JSON.
 */
const register_upcoming_events_listener = (endpoint, page) => {
  let responses = [];
  return new Promise((resolve, reject) => {
    page.on('response', async (response) => {
      if (endpoint !== response.request().url()) {
        return;
      }
      try {
        const json = await response.json();

        const upcoming_events = get_upcoming_events(json);
        if (upcoming_events !== null) {
          responses = [upcoming_events, ...responses];
          if (!upcoming_events.page_info.has_next_page) {
            resolve(responses);
          }
        }

        const past_events = get_past_events(json);
        if (past_events !== null && !past_events.page_info.has_next_page) {
          resolve(responses);
        }
      } catch (err) {
        reject(err);
      }
    });
  });
};

const { page_ids, output, events: event_file } = parse_args(
  process.argv.slice(2),
);

/**
 * Loads the previously scraped events from a JSON file.
 *
 * @param {string|null} file_path - JSON file, or null when there is none.
 * @returns {Promise<Array>} Parsed content, or `[]` when no path is given or
 *   the file cannot be read/parsed (the error is logged).
 */
const read_previous_events = async (file_path) => {
  if (file_path == null) {
    return [];
  }
  try {
    const content = await fs.readFile(file_path, { encoding: 'utf-8' });
    return JSON.parse(content);
  } catch (err) {
    console.error(err);
    // Fall back to "nothing scraped yet" so callers always get an array.
    return [];
  }
};

/**
 * Scrolls `listing_page` until the events listener settles and returns the
 * mapped events found on it. A listener failure is logged and yields [].
 */
const scrape_page_events = async (listing_page, page_id) => {
  let scraping = true;
  const upcoming_events = register_upcoming_events_listener(
    graphql_endpoint,
    listing_page,
  )
    .catch((err) => {
      console.error(err);
      return [];
    })
    .finally(() => {
      scraping = false;
    });

  await listing_page.goto(page_id);
  // Keep scrolling so the page requests further batches; the listener
  // flips `scraping` once the last batch has arrived.
  while (scraping) {
    await listing_page.evaluate(() => window.scrollBy(0, window.innerHeight));
  }

  const edges = [];
  for (const response of await upcoming_events) {
    edges.push(...response.edges);
  }
  return edges.map(map_event);
};

/** Opens a dedicated tab, loads the event's image data, closes the tab. */
const load_event_details = async (browser, event) => {
  const event_page = await browser.newPage();
  try {
    const event_data = await load_event(event_page, event.event_id);
    return { ...event_data, ...event };
  } finally {
    await event_page.close().catch(console.error);
  }
};

(async () => {
  let browser = null;
  try {
    // The directory must exist before any image is written.
    await create_images_directory('./img');
    const previous_events = await read_previous_events(event_file);
    browser = await open_browser();

    const known_ids = new Set(
      previous_events.map((previous_event) => previous_event.event_id),
    );
    let events = [];

    for (const page_id of page_ids) {
      const listing_page = await browser.newPage();
      try {
        const new_events = await scrape_page_events(listing_page, page_id);

        // Only events not seen before (in the previous run or on an earlier
        // page of this run) need their detail page loaded.
        const unseen = merge_edges(new_events, []).filter(
          ({ event_id }) => !known_ids.has(event_id),
        );
        const loaded = await Promise.all(
          unseen.map((event) => load_event_details(browser, event)),
        );
        loaded.forEach(({ event_id }) => known_ids.add(event_id));
        events = [...loaded, ...events];
      } finally {
        await listing_page.close().catch(console.error);
      }
    }

    events = await Promise.all(
      events.map(async (event) => {
        const images = await save_images(event);
        // Drop the raw buffer from the record without mutating `event`.
        const { image, ...rest } = event;
        return { images, ...rest };
      }),
    );

    // NOTE(review): nothing is emitted when `output` is non-null; confirm
    // against `parse_args` whether the result should be written there.
    if (output === null) {
      console.log(JSON.stringify([...events, ...previous_events]));
    }
  } catch (err) {
    console.error(err);
    process.exitCode = 1;
  } finally {
    if (browser !== null) {
      await browser.close().catch(console.error);
    }
  }
  process.exit();
})();