Browse Source

scrape all upcoming events following cursor

fix-broken-scrape
Jørgen Lien Sellæg 6 years ago
parent
commit
dcd77573ae
  1. 91
      scrape.js

91
scrape.js

@ -17,27 +17,6 @@ const get_upcoming_events = pathOr(
const merge_edges = unionWith(eqBy(prop('event_id')));
/**
 * Navigate to an events page and wait for the first GraphQL response from
 * which `get_upcoming_events` extracts a non-null value.
 *
 * A `'response'` listener is attached before navigating so that no response
 * can be missed. After navigation the page is scrolled by one viewport
 * height. The listener is always detached before this function returns.
 *
 * @param {object} page - Page-like object exposing `on`, `off`, `goto` and
 *   `evaluate` (a Puppeteer page in the surrounding script).
 * @param {string} event_page - URL to navigate to.
 * @returns {Promise<*>} The first non-null result of `get_upcoming_events`,
 *   or `undefined` when navigation, scrolling or response parsing failed
 *   (the error is written to `console.error`).
 */
const load_page = async (page, event_page) => {
  let on_response = null;
  try {
    const graphql_data = new Promise((resolve, reject) => {
      on_response = async (response) => {
        if (graphql_endpoint !== response.request().url()) {
          return;
        }
        try {
          const upcoming_events = get_upcoming_events(await response.json());
          if (upcoming_events !== null) {
            resolve(upcoming_events);
          }
        } catch (err) {
          // Without this the failure would surface as an unhandled rejection
          // inside the listener and `graphql_data` would never settle.
          reject(err);
        }
      };
      page.on('response', on_response);
    });
    // A response can fail while `goto`/`evaluate` are still pending, i.e.
    // before `graphql_data` is awaited. Mark the rejection as handled here;
    // the `await` below still rethrows it into the surrounding `catch`.
    graphql_data.catch(() => {});
    await page.goto(event_page);
    await page.evaluate(() => window.scrollBy(0, window.innerHeight));
    return await graphql_data;
  } catch (e) {
    console.error(e);
  } finally {
    // Detach on every exit path so the page does not keep parsing responses
    // for a promise that has already been consumed or abandoned.
    if (on_response !== null) {
      page.off('response', on_response);
    }
  }
};
const load_event = async (page, event_id) => {
try {
const image_data = new Promise((resolve, reject) => {
@ -174,6 +153,28 @@ const event_ids = pathOr('', ['event_ids'], argv)
.filter((str) => str.length !== 0)
.map((event_id) => `https://www.facebook.com/events/${event_id}`);
/**
 * Collect every upcoming-events payload that `page` receives from `endpoint`
 * until a payload reports that there is no further page.
 *
 * Each response whose request URL equals `endpoint` is parsed as JSON and
 * passed through `get_upcoming_events`; null results are ignored. Payloads
 * are prepended, so the resolved array is ordered newest-first.
 *
 * The `'response'` listener is detached as soon as the promise settles, so
 * later traffic on the page is no longer downloaded and parsed.
 *
 * @param {string} endpoint - Exact request URL to match.
 * @param {object} page - Page-like object exposing `on` and `off`
 *   (a Puppeteer page in the surrounding script).
 * @returns {Promise<Array>} Resolves with the collected payloads once one has
 *   `page_info.has_next_page` falsy. Rejects with the first error thrown
 *   while parsing or inspecting a matching response.
 */
const register_upcoming_events_listener = (endpoint, page) => {
  let responses = [];
  return new Promise((resolve, reject) => {
    let settled = false;

    const on_response = async (response) => {
      if (settled || endpoint !== response.request().url()) {
        return;
      }
      try {
        const json = await response.json();
        const upcoming_events = get_upcoming_events(json);
        // `settled` is re-checked because another response may have finished
        // the collection while this one was still being parsed.
        if (settled || upcoming_events === null) {
          return;
        }
        responses = [upcoming_events, ...responses];
        if (!upcoming_events.page_info.has_next_page) {
          settle(resolve, responses);
        }
      } catch (err) {
        settle(reject, err);
      }
    };

    // Detach first, then settle, so no further response is processed.
    const settle = (finish, value) => {
      settled = true;
      page.off('response', on_response);
      finish(value);
    };

    page.on('response', on_response);
  });
};
(async () => {
create_images_directory('./events/img');
@ -182,31 +183,33 @@ const event_ids = pathOr('', ['event_ids'], argv)
let events = [];
for (let page_id of page_ids) {
let scraping = true;
const facebook_page = await browser.newPage();
const data = await load_page(facebook_page, page_id);
const edges = data.edges.map(map_event);
events = merge_edges(edges, events);
events = await Promise.all(
events.map(async (event) => {
const event_page = await browser.newPage();
const event_data = await load_event(event_page, event.event_id);
return {
...event_data,
...event,
};
}),
);
const upcoming_events = register_upcoming_events_listener(
graphql_endpoint,
facebook_page,
)
.then((upcoming_events) => {
scraping = false;
return upcoming_events;
})
.catch((err) => {
console.error(err);
scraping = false;
});
await facebook_page.goto(page_id);
while (scraping) {
await facebook_page.evaluate(() =>
window.scrollBy(0, window.innerHeight),
);
}
events = await Promise.all(
events.map(async (event) => {
const images = await save_images(event);
delete event.image;
return {
images,
...event,
};
}),
const responses = await upcoming_events;
const nodes = responses.reduce(
(res, current) => [...res, ...current.edges],
[],
);
}
console.log(JSON.stringify(events));

Loading…
Cancel
Save