|
|
|
|
const puppeteer = require('puppeteer');
|
|
|
|
|
const { pathOr, maxBy } = require('ramda');
|
|
|
|
|
const path = require('path');
|
|
|
|
|
const fs = require('fs').promises;
|
|
|
|
|
const filesystem = require('fs');
|
|
|
|
|
|
|
|
|
|
const { graphql_endpoint } = require('./constants');
|
|
|
|
|
const {
|
|
|
|
|
event_url,
|
|
|
|
|
get_city_name,
|
|
|
|
|
get_event_host,
|
|
|
|
|
get_past_events,
|
|
|
|
|
get_upcoming_events,
|
|
|
|
|
merge_edges,
|
|
|
|
|
parse_args,
|
|
|
|
|
save_images,
|
|
|
|
|
} = require('./logic');
|
|
|
|
|
|
|
|
|
|
/**
 * Opens the page of a single event and returns the largest JPEG that was
 * fetched while the page loaded.
 *
 * Only responses whose URL path ends in `.jpg` are considered; "largest" means
 * the buffer with the greatest `length`.
 *
 * @param {object} page - Puppeteer page used for the navigation. The caller
 *   owns the page; it is not closed here.
 * @param {*} event_id - Identifier handed to `event_url` to build the address.
 * @returns {Promise<{image?: Buffer}|undefined>} `{ image }` with the largest
 *   JPEG, `{}` when no JPEG was captured, or `undefined` when the navigation
 *   failed (the error is logged with `console.error`).
 */
const load_event = async (page, event_id) => {
  const images = [];
  const downloads = [];

  // Nothing awaits the value returned by an event listener, so every failure
  // has to be handled inside it; otherwise it becomes an unhandled rejection.
  const on_response = (response) => {
    const download = (async () => {
      try {
        const { pathname } = new URL(response.request().url());
        if (path.extname(pathname) === '.jpg') {
          images.push(await response.buffer());
        }
      } catch (err) {
        console.error(err);
      }
    })();
    downloads.push(download);
  };

  page.on('response', on_response);
  try {
    await page.goto(event_url(event_id));
    // Bodies may still be downloading when the navigation resolves; wait for
    // the ones already in flight so the largest image is not missed.
    await Promise.all(downloads);

    if (images.length === 0) {
      // reduce() without an initial value throws on an empty array.
      return {};
    }
    const image = images.reduce((largest, candidate) =>
      maxBy((item) => item.length, largest, candidate),
    );
    return { image };
  } catch (e) {
    console.error(e);
  } finally {
    // Detach so the listener does not outlive this call.
    page.off('response', on_response);
  }
};
|
|
|
|
|
|
|
|
|
|
/**
 * Converts one GraphQL edge into the flat event record used by this script.
 *
 * @param {{node: object}} edge - Edge whose `node` holds the raw event.
 * @returns {object} Record with `date`, `name`, `event_id`, `ticket_url`
 *   (empty string when the node has no `event_buy_ticket_url`) and a
 *   `location` object carrying the host and the city name.
 */
const map_event = ({ node }) => {
  const buy_url = pathOr('', ['event_buy_ticket_url'], node);
  const city_name = get_city_name(node);
  const event_host = get_event_host(node);
  const { time_range, name, id } = node;

  return {
    date: time_range,
    name,
    event_id: id,
    ticket_url: buy_url,
    location: {
      host: event_host,
      location: city_name,
    },
  };
};
|
|
|
|
|
|
|
|
|
|
/**
 * Creates `images_directory`, including any missing parent directories.
 * A failure is reported through `console.error` instead of being thrown.
 *
 * @param {string} images_directory - Directory to create.
 * @returns {Promise<*>} Settles with the result of `fs.mkdir`, or `undefined`
 *   when the directory could not be created.
 */
const create_images_directory = async (images_directory) => {
  try {
    return await fs.mkdir(images_directory, { recursive: true });
  } catch (mkdir_error) {
    console.error(mkdir_error);
    return undefined;
  }
};
|
|
|
|
|
|
|
|
|
|
/**
 * Launches a headless browser through Puppeteer.
 *
 * @returns {Promise<object>} The launched Puppeteer browser.
 */
const open_browser = async () => {
  const launch_options = {
    headless: true,
    // NOTE(review): presumably set for containers with a small /dev/shm;
    // confirm against the deployment environment.
    args: ['--disable-dev-shm-usage'],
  };
  return puppeteer.launch(launch_options);
};
|
|
|
|
|
|
|
|
|
|
/**
 * Collects the upcoming-events payloads that `page` receives from `endpoint`.
 *
 * Every response whose request URL equals `endpoint` is parsed as JSON. Each
 * upcoming-events payload found in it is prepended to the result, so the most
 * recently received payload comes first. Collection ends as soon as either the
 * upcoming-events or the past-events payload reports
 * `page_info.has_next_page === false`.
 *
 * @param {string} endpoint - Exact request URL to listen for.
 * @param {object} page - Puppeteer page (any emitter exposing `on`/`off`).
 * @returns {Promise<object[]>} Resolves with the collected payloads; rejects
 *   with the error raised while reading or inspecting a matching response.
 */
const register_upcoming_events_listener = (endpoint, page) =>
  new Promise((resolve, reject) => {
    let responses = [];
    let settled = false;

    // Settle exactly once and detach the listener, so responses arriving
    // afterwards are no longer parsed.
    const settle = (finish, value) => {
      if (settled) {
        return;
      }
      settled = true;
      page.off('response', on_response);
      finish(value);
    };

    const on_response = async (response) => {
      if (settled || endpoint !== response.request().url()) {
        return;
      }
      try {
        const json = await response.json();

        const upcoming_events = get_upcoming_events(json);
        if (upcoming_events !== null) {
          responses = [upcoming_events, ...responses];
          if (!upcoming_events.page_info.has_next_page) {
            settle(resolve, responses);
          }
        }

        // Past events are not collected; they only signal the end.
        const past_events = get_past_events(json);
        if (past_events !== null && !past_events.page_info.has_next_page) {
          settle(resolve, responses);
        }
      } catch (err) {
        settle(reject, err);
      }
    };

    page.on('response', on_response);
  });
|
|
|
|
|
|
|
|
|
|
// Command-line options, parsed from everything after the script path:
// `page_ids` is iterated as the pages to visit, `output` decides whether the
// result is printed (it is printed when null), and `events` (kept as
// `event_file`) is handed to `read_previous_events`.
const cli_arguments = process.argv.slice(2);
const { page_ids, output, events: event_file } = parse_args(cli_arguments);
|
|
|
|
|
|
|
|
|
|
/**
 * Reads the events stored by an earlier run.
 *
 * @param {string|null} path - Location of the JSON file, or `null` when no
 *   file was given.
 * @returns {Promise<*>} The parsed content of the file. Resolves to `[]` when
 *   `path` is `null`, when the file does not exist, or when it cannot be read
 *   or parsed (that error is logged with `console.error`).
 */
const read_previous_events = async (path) => {
  if (path === null || !filesystem.existsSync(path)) {
    return [];
  }

  try {
    const content = await fs.readFile(path, { encoding: 'utf-8' });
    return JSON.parse(content);
  } catch (err) {
    // Fall back to "no previous events" instead of resolving to undefined,
    // which callers cannot search or spread.
    console.error(err);
    return [];
  }
};
|
|
|
|
|
|
|
|
|
|
// Scrolls one page until its upcoming events have been received and returns
// them as mapped event records. The page is closed afterwards.
const collect_page_events = async (browser, page_id) => {
  const facebook_page = await browser.newPage();
  try {
    let scraping = true;
    const upcoming_events = register_upcoming_events_listener(
      graphql_endpoint,
      facebook_page,
    )
      .catch((err) => {
        // A failed listener yields no events instead of an undefined result.
        console.error(err);
        return [];
      })
      .finally(() => {
        scraping = false;
      });

    await facebook_page.goto(page_id);

    // Keep scrolling so the page requests further events until the listener
    // settles.
    while (scraping) {
      await facebook_page.evaluate(() =>
        window.scrollBy(0, window.innerHeight),
      );
    }

    const responses = await upcoming_events;
    return responses
      .reduce((res, current) => [...res, ...current.edges], [])
      .map(map_event);
  } finally {
    await facebook_page.close().catch(console.error);
  }
};

// Adds the data returned by load_event to an event that has no image yet,
// using a temporary page that is closed again.
const attach_event_image = async (browser, event) => {
  if ('image' in event) {
    // Already loaded while processing an earlier page; the stored value would
    // win over a reloaded one anyway.
    return event;
  }
  const event_page = await browser.newPage();
  try {
    const event_data = await load_event(event_page, event.event_id);
    return { ...event_data, ...event };
  } finally {
    await event_page.close().catch(console.error);
  }
};

// Returns a copy of the event whose `date.start`, when present, is a Date.
const with_parsed_start = (event) => {
  const start = pathOr(null, ['date', 'start'], event);
  if (start === null) {
    return event;
  }
  return { ...event, date: { ...event.date, start: new Date(start) } };
};

// Timestamp of `date.start`, or NaN when it is missing or not a Date.
const start_timestamp = (event) => {
  const start = pathOr(null, ['date', 'start'], event);
  return start instanceof Date ? start.getTime() : NaN;
};

// Ascending by start time; events without a usable start date go last.
const by_start_date = (a, b) => {
  const a_time = start_timestamp(a);
  const b_time = start_timestamp(b);
  const a_missing = Number.isNaN(a_time);
  const b_missing = Number.isNaN(b_time);
  if (a_missing || b_missing) {
    return Number(a_missing) - Number(b_missing);
  }
  return a_time - b_time;
};

/**
 * Entry point: scrapes the events of every page in `page_ids`, skips events
 * already present in the previous-events file, loads and saves an image per
 * new event and, when `output` is null, prints all events sorted by start
 * date as JSON on stdout. The process is always terminated at the end; a
 * failure is logged and reported through a non-zero exit code.
 */
(async () => {
  let browser = null;
  try {
    // Awaited so the directory exists before any image is saved.
    await create_images_directory('./img');

    const loaded_events = await read_previous_events(event_file);
    // Guard against a previous-events file that did not yield an array.
    const previous_events = Array.isArray(loaded_events) ? loaded_events : [];
    const known_event_ids = new Set(
      previous_events.map(({ event_id }) => event_id),
    );

    browser = await open_browser();

    let events = [];

    for (const page_id of page_ids) {
      const new_events = await collect_page_events(browser, page_id);

      events = merge_edges(new_events, events).filter(
        ({ event_id }) => !known_event_ids.has(event_id),
      );

      events = await Promise.all(
        events.map((event) => attach_event_image(browser, event)),
      );
    }

    events = await Promise.all(
      events.map(async (event) => {
        const images = await save_images(event);
        // The raw image is dropped from the exported record.
        const { image, ...rest } = event;
        return {
          images,
          ...rest,
        };
      }),
    );

    if (output === null) {
      const all_events = [...events, ...previous_events]
        .map(with_parsed_start)
        .sort(by_start_date);

      // process.exit() does not wait for pending stdout writes, so wait for
      // the write callback before leaving the try block.
      await new Promise((resolve) =>
        process.stdout.write(`${JSON.stringify(all_events)}\n`, resolve),
      );
    }
  } catch (err) {
    console.error(err);
    process.exitCode = 1;
  } finally {
    if (browser !== null) {
      await browser.close().catch(console.error);
    }
    process.exit();
  }
})();
|