You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
375 lines
9.1 KiB
375 lines
9.1 KiB
const { |
|
eqBy, |
|
hasPath, |
|
maxBy, |
|
pathOr, |
|
prop, |
|
props, |
|
unionWith, |
|
} = require('ramda'); |
|
const parseArgs = require('minimist'); |
|
const process = require('process'); |
|
|
|
/** Builds the public URL of a single Facebook event from its id. */
const event_url = (event_id) => {
  return `https://www.facebook.com/events/${event_id}`;
};

/** Builds the public URL of a Facebook page from its id (or vanity name). */
const page_url = (page_id) => {
  return `https://www.facebook.com/${page_id}`;
};

/** Builds the URL of the events listing that belongs to a Facebook page. */
const page_events_url = (page_id) => `${page_url(page_id)}/events/`;
|
const { graphql_endpoint } = require('./constants'); |
|
|
|
const fs = require('fs').promises; |
|
const filesystem = require('fs'); |
|
|
|
const path = require('path'); |
|
|
|
const gm = require('gm').subClass({ imageMagick: true }); |
|
|
|
const puppeteer = require('puppeteer'); |
|
|
|
/**
 * Normalises a raw command-line value to a string and strips one pair of
 * matching surrounding quotes (either "..." or '...').
 *
 * The argument parser does not always hand back strings: numeric-looking
 * values (such as a numeric page id) arrive as numbers and bare flags arrive
 * as booleans. Calling string methods on those used to throw a TypeError, so
 * the value is coerced first.
 *
 * @param {*} page_id - Raw value taken from the parsed arguments.
 * @returns {string} The value as text without surrounding quotes. Missing
 *   values (null/undefined) and booleans carry no text and yield ''.
 */
const flatten_string = (page_id) => {
  const carries_no_text =
    page_id === null || page_id === undefined || typeof page_id === 'boolean';
  const text = carries_no_text ? '' : String(page_id);

  const wrapped_in = (quote) => text.startsWith(quote) && text.endsWith(quote);

  if (wrapped_in('"') || wrapped_in("'")) {
    return text.slice(1, text.length - 1);
  }
  return text;
};
|
|
|
/**
 * Reads the output option from the parsed arguments.
 *
 * `output` takes precedence over its short form `o`; the chosen value is
 * passed through flatten_string.
 *
 * @param {object} argv - Parsed command-line arguments.
 * @returns {string|null} The output value, or null when it is absent or empty.
 */
const parse_output = (argv) => {
  const candidates = props(['output', 'o'], argv);
  const first_defined = candidates.find((item) => item !== undefined);
  const cleaned = flatten_string(first_defined === undefined ? '' : first_defined);

  return cleaned === '' ? null : cleaned;
};
|
|
|
/**
 * Parses the command-line arguments of the scraper.
 *
 * When any of `h`, `help` or `?` is present the process exits with status 1.
 *
 * @param {string[]} args - Raw arguments, handed to the argument parser.
 * @returns {{page_ids: string[], events: (string|null), output: (string|null)}}
 *   `page_ids` holds the events-listing URL of every page given through
 *   `page`, `p` and `pages` (comma-separated, in that order); `events` and
 *   `output` are null when not supplied.
 */
const parse_args = (args) => {
  const argv = parseArgs(args);

  const help_requested = ['h', 'help', '?'].some((flag) =>
    hasPath([flag], argv),
  );
  if (help_requested) {
    process.exit(1);
  }

  // Turns one comma-separated option into a list of page events URLs.
  const read_page_urls = (param) => {
    const raw = flatten_string(pathOr('', [param], argv));
    return raw
      .split(',')
      .filter((id) => id.length !== 0)
      .map((id) => page_events_url(id));
  };

  const raw_events = flatten_string(pathOr('', ['events'], argv));
  const events = raw_events === '' ? null : raw_events;
  const output = parse_output(argv);

  const page_ids = ['page', 'p', 'pages'].reduce(
    (urls, param) => urls.concat(read_page_urls(param)),
    [],
  );

  return { page_ids, events, output };
};
|
|
|
/**
 * Picks `data.page.upcoming_events` out of a response payload.
 * Yields null when any step of that path is missing.
 */
const get_upcoming_events_from_page = pathOr(null, [
  'data',
  'page',
  'upcoming_events',
]);
|
|
|
/**
 * Picks `data.page.past_events` out of a response payload.
 * Yields null when any step of that path is missing.
 */
const get_past_events_from_page = pathOr(null, [
  'data',
  'page',
  'past_events',
]);
|
|
|
/**
 * Unions two lists of events, treating entries with an equal `event_id`
 * as the same event.
 */
const merge_edges = unionWith((left, right) =>
  eqBy(prop('event_id'), left, right),
);
|
|
|
/**
 * Writes image data to disk.
 *
 * @param {string} path - Destination file path.
 * @param {*} image - Image data, handed to fs.writeFile as-is.
 * @returns {Promise<void>} Settles when the write finishes.
 */
const write_image = (path, image) => {
  // encoding is explicitly null: no text encoding is requested for the data.
  const options = { encoding: null };
  return fs.writeFile(path, image, options);
};
|
|
|
/**
 * Promise adapter around the callback-style `write` of an image object.
 *
 * @param {{write: Function}} image - Object exposing `write(path, callback)`.
 * @param {string} path - Destination file path.
 * @returns {Promise<void>} Resolves once the callback reports no error and
 *   rejects with the reported error otherwise.
 */
const gm_write = (image, path) =>
  new Promise((resolve, reject) => {
    image.write(path, (err) => {
      if (err) {
        reject(err);
        return;
      }
      resolve();
    });
  });
|
|
|
/**
 * Writes a square, centre-cropped version of an image.
 *
 * @param {string} image_path - Destination path of the square image.
 * @param {*} original - Source image, handed to gm() as-is.
 * @returns {Promise<void>} Settles when the square image has been written.
 * @throws {Error} 'Could not get image.' when the image size cannot be read;
 *   the underlying error is kept on the `cause` property so the real reason
 *   is not lost.
 */
const write_resized = async (image_path, original) => {
  const image = gm(original);

  const size = await new Promise((resolve, reject) => {
    image.size((err, value) => {
      if (!err) {
        resolve(value);
        return;
      }
      const failure = new Error('Could not get image.');
      // Assigned by hand so the cause survives on runtimes whose Error
      // constructor does not accept an options argument.
      failure.cause = err;
      reject(failure);
    });
  });

  // Round odd dimensions up to the next even number so that half of the
  // difference between the two sides (the crop offset) is a whole number.
  const to_even = (length) => (length % 2 === 1 ? length + 1 : length);
  const y = to_even(size.height);
  const x = to_even(size.width);

  image.resize(x, y);

  // crop(width, height, left, top): keep a square whose side is the shorter
  // dimension, centred along the longer one.
  if (y > x) {
    image.crop(x, x, 0, (y - x) / 2);
  } else if (y < x) {
    image.crop(y, y, (x - y) / 2, 0);
  }

  return gm_write(image, image_path);
};
|
|
|
/**
 * Stores an event image twice under ./img/: as received and as a square crop.
 *
 * @param {{image: *, event_id: *}} param0 - Image data (may be missing or
 *   null) and the event id used to name the files.
 * @returns {Promise<Array|object>} An empty array when there is no image;
 *   `{ original, square }` with both file paths on success;
 *   `{ original: null }` when either write fails (the error is logged).
 */
const save_images = async ({ image = null, event_id }) => {
  if (image === null) {
    return [];
  }

  const paths = {
    original: `./img/${event_id}.jpg`,
    square: `./img/${event_id}-square.jpg`,
  };

  // Both writes are started before either is awaited so they run together.
  const writes = [
    write_image(paths.original, image),
    write_resized(paths.square, image),
  ];

  try {
    await Promise.all(writes);
  } catch (err) {
    console.error(err);
    return { original: null };
  }

  return paths;
};
|
|
|
/**
 * Reads `event_place.city.contextual_name` from an event.
 * Yields '' when any step of that path is missing.
 */
const get_city_name = (event) =>
  pathOr('', ['event_place', 'city', 'contextual_name'], event);
|
|
|
/**
 * Reads `event_place.contextual_name` from an event.
 * Yields '' when any step of that path is missing.
 */
const get_event_host = (event) =>
  pathOr('', ['event_place', 'contextual_name'], event);
|
|
|
/**
 * Makes sure the directory for downloaded images exists.
 *
 * @param {string|null|undefined} images_directory - Directory to create.
 * @returns {Promise} Rejects with the string 'Image path was not set' when no
 *   directory is given; resolves immediately when the directory already
 *   exists; otherwise settles with the recursive mkdir, whose failure is
 *   logged rather than propagated.
 */
const create_images_directory = (images_directory) => {
  // Loose comparison on purpose: matches both null and undefined.
  if (images_directory == null) {
    return Promise.reject('Image path was not set');
  }

  const already_exists = filesystem.existsSync(images_directory);
  if (already_exists) {
    return Promise.resolve();
  }

  return fs.mkdir(images_directory, { recursive: true }).catch(console.error);
};
|
|
|
/**
 * Loads previously stored events from a JSON file.
 *
 * @param {string|null} path - File to read; null means there is none.
 * @returns {Promise<*>} The parsed file content, or an empty array when no
 *   path is given or the file does not exist. A read or parse failure is
 *   logged and ends the process with status 1.
 */
const read_previous_events = (path) => {
  const file_available = path !== null && filesystem.existsSync(path);
  if (!file_available) {
    return Promise.resolve([]);
  }

  const on_failure = (error) => {
    console.error(error);
    process.exit(1);
  };

  return fs
    .readFile(path, { encoding: 'utf-8' })
    .then((content) => JSON.parse(content))
    .catch(on_failure);
};
|
|
|
/**
 * Opens an event in the given page and returns the largest .jpg that was
 * downloaded while the event page loaded.
 *
 * @param {object} page - Browser page; must provide `on`, `off` and `goto`.
 * @param {string|number} event_id - Id of the event to open.
 * @returns {Promise<{image: (Buffer|null)}|undefined>} The largest image by
 *   byte length, `{ image: null }` when no .jpg response was seen, or
 *   undefined when loading failed (the error is logged).
 */
const load_event = async (page, event_id) => {
  const images = [];
  const downloads = [];

  // Never rejects: a body that cannot be read only costs us that one image
  // instead of surfacing as an unhandled rejection.
  const collect_jpg = async (response) => {
    try {
      const { pathname } = new URL(response.request().url());
      if (path.extname(pathname) !== '.jpg') {
        return;
      }
      images.push(await response.buffer());
    } catch (err) {
      console.error(err);
    }
  };

  const on_response = (response) => {
    downloads.push(collect_jpg(response));
  };

  page.on('response', on_response);

  try {
    // goto settles only after navigation has finished, so a separate wait
    // for 'domcontentloaded' is not needed.
    await page.goto(event_url(event_id));

    // Let bodies that are still being read finish before choosing.
    await Promise.all(downloads);

    // reduce() without an initial value throws on an empty list.
    if (images.length === 0) {
      return { image: null };
    }

    const image = images.reduce((largest, candidate) =>
      maxBy((item) => item.length, largest, candidate),
    );
    return { image };
  } catch (e) {
    console.error(e);
    return undefined;
  } finally {
    // Detach so a reused page does not accumulate one listener per call.
    page.off('response', on_response);
  }
};
|
|
|
/**
 * Converts one edge of an events response into the flat record used by the
 * scraper.
 *
 * @param {{node: object}} edge - Edge whose `node` is the raw event.
 * @returns {object} Record with `date`, `name`, `event_id`, `ticket_url`
 *   ('' when the event has none) and `location` (`host` and `location`,
 *   each '' when unknown).
 */
const map_event = ({ node: event }) => ({
  date: event.time_range,
  name: event.name,
  event_id: event.id,
  ticket_url: pathOr('', ['event_buy_ticket_url'], event),
  location: {
    host: get_event_host(event),
    location: get_city_name(event),
  },
});
|
|
|
/**
 * Launches the browser used for scraping: with a visible window
 * (`headless: false`) and the `--disable-dev-shm-usage` flag.
 *
 * @returns {Promise<object>} The launched browser.
 */
const open_browser = async () => {
  const launch_options = {
    headless: false,
    args: ['--disable-dev-shm-usage'],
  };
  return puppeteer.launch(launch_options);
};
|
|
|
/**
 * Tells whether the loaded page lists upcoming events: its body text must
 * contain "Upcoming events" and must not contain the "not have any upcoming
 * events" notice.
 *
 * A function is evaluated rather than a script string: a string script
 * declares its `let` in the page's global scope, so evaluating it a second
 * time in the same document fails with an "already been declared" error.
 *
 * @param {object} page - Browser page providing `evaluate`.
 * @returns {Promise<boolean>} True when upcoming events are present.
 */
const has_upcoming_events = async (page) =>
  page.evaluate(() => {
    const text = document.querySelector('body').innerText;
    return (
      text.includes('Upcoming events') &&
      !text.includes('not have any upcoming events')
    );
  });
|
|
|
/**
 * Tells whether the loaded page lists past events: its body text must
 * contain "Past events" and must not contain the "not have any past events"
 * notice.
 *
 * A function is evaluated rather than a script string: a string script
 * declares its `let` in the page's global scope, so evaluating it a second
 * time in the same document fails with an "already been declared" error.
 *
 * @param {object} page - Browser page providing `evaluate`.
 * @returns {Promise<boolean>} True when past events are present.
 */
const has_past_events = async (page) =>
  page.evaluate(() => {
    const text = document.querySelector('body').innerText;
    return (
      text.includes('Past events') &&
      !text.includes('not have any past events')
    );
  });
|
|
|
/**
 * Collects the event batches a page receives from one endpoint.
 *
 * Every response whose request URL equals `endpoint` and whose JSON body
 * contains the wanted events (upcoming by default, past when `past_events`
 * is true) is recorded. Collection ends with the first batch whose
 * `page_info.has_next_page` is false.
 *
 * @param {string} endpoint - Exact request URL to listen for.
 * @param {object} page - Browser page providing `on` and `off`.
 * @param {boolean} [past_events=false] - Collect past instead of upcoming
 *   events.
 * @returns {Promise<object[]>} The collected batches, newest first. Rejects
 *   when a matching payload cannot be processed (for example because
 *   `page_info` is missing) instead of leaving the promise pending forever.
 */
const register_page_scraper = (endpoint, page, past_events = false) => {
  const select_events = past_events
    ? get_past_events_from_page
    : get_upcoming_events_from_page;
  let responses = [];

  return new Promise((resolve, reject) => {
    const on_response = async (response) => {
      if (endpoint !== response.request().url()) {
        return;
      }

      let json;
      try {
        json = await response.json();
      } catch (error) {
        // A body that is not JSON cannot be an events payload; skip it.
        return;
      }

      try {
        const events = select_events(json);
        if (events === null) {
          return;
        }

        responses = [events, ...responses];
        if (!events.page_info.has_next_page) {
          // Finished: stop listening so the page does not keep a dead handler.
          page.off('response', on_response);
          resolve(responses);
        }
      } catch (error) {
        page.off('response', on_response);
        reject(error);
      }
    };

    page.on('response', on_response);
  });
};
|
|
|
/**
 * Scrapes the events of one Facebook page.
 *
 * Opens a new browser page, registers a scraper for each requested kind of
 * event, navigates, and keeps scrolling until every scraper that is still
 * running has received its last batch. The browser page is always closed
 * before returning so that repeated calls do not leak one tab each.
 *
 * NOTE(review): nothing bounds the scrolling loop; if a scraper never
 * settles this function does not return.
 *
 * @param {object} browser - Browser providing `newPage()`.
 * @param {string} page_id - Passed to `goto` unchanged, so it has to be a
 *   navigable URL.
 * @param {boolean} [get_upcoming_events=true] - Collect upcoming events.
 * @param {boolean} [get_past_events=false] - Collect past events.
 * @returns {Promise<object[]>} Mapped events, upcoming before past.
 */
const get_page_events = async (
  browser,
  page_id,
  get_upcoming_events = true,
  get_past_events = false,
) => {
  const facebook_page = await browser.newPage();

  try {
    // Which scrapers are still waiting for their final batch.
    const still_scraping = { past: false, upcoming: false };

    // Starts one scraper; a failure is logged and counts as "no events".
    const start_scraper = (kind) => {
      still_scraping[kind] = true;
      return register_page_scraper(
        graphql_endpoint,
        facebook_page,
        kind === 'past',
      )
        .then((responses) => {
          still_scraping[kind] = false;
          return responses;
        })
        .catch((err) => {
          console.error(err);
          still_scraping[kind] = false;
          return [];
        });
    };

    // Scrapers are registered before navigating so no response is missed.
    let past_events = get_past_events
      ? start_scraper('past')
      : Promise.resolve([]);
    let upcoming_events = get_upcoming_events
      ? start_scraper('upcoming')
      : Promise.resolve([]);

    await facebook_page.goto(page_id);
    await facebook_page.waitFor(2000);

    // A requested kind is already settled when the page shows none of it.
    const past_resolved =
      get_past_events && !(await has_past_events(facebook_page));
    const upcoming_resolved =
      get_upcoming_events && !(await has_upcoming_events(facebook_page));

    if (past_resolved) {
      past_events = Promise.resolve([]);
      still_scraping.past = false;
    }

    if (upcoming_resolved) {
      upcoming_events = Promise.resolve([]);
      still_scraping.upcoming = false;
    }

    // Scrolling makes the page request further batches.
    while (still_scraping.past || still_scraping.upcoming) {
      await facebook_page.waitFor(1000);
      await facebook_page.evaluate(() =>
        window.scrollBy(0, window.innerHeight),
      );
    }

    const responses = [...(await upcoming_events), ...(await past_events)];

    const nodes = [];
    for (const { edges } of responses) {
      nodes.push(...edges);
    }

    return nodes.map(map_event);
  } finally {
    try {
      await facebook_page.close();
    } catch (err) {
      // Failing to close must not hide the result or the original error.
      console.error(err);
    }
  }
};
|
|
|
module.exports = { |
|
create_images_directory, |
|
open_browser, |
|
parse_args, |
|
read_previous_events, |
|
get_page_events, |
|
merge_edges, |
|
};
|
|
|