Browse Source

update scraper to distinguish past and upcoming events

fix-broken-scrape
Jørgen Lien Sellæg 6 years ago
parent
commit
32c7606219
  1. 133
      src/logic.js
  2. 2
      src/scrape.js

133
src/logic.js

@@ -215,64 +215,129 @@ const map_event = ({ node: event }) => {
const open_browser = async () => { const open_browser = async () => {
const browser = await puppeteer.launch({ const browser = await puppeteer.launch({
headless: true, headless: false,
args: ['--disable-dev-shm-usage'], args: ['--disable-dev-shm-usage'],
}); });
return browser; return browser;
}; };
const register_page_scraper = (endpoint, page) => { const has_upcoming_events = async (page) =>
await page.evaluate(
'let txt = document.querySelector("body").innerText;txt.includes("Upcoming events") && !txt.includes("not have any upcoming events")',
);
const has_past_events = async (page) =>
await page.evaluate(
'let inner = document.querySelector("body").innerText;inner.includes("Past events") && !inner.includes("not have any past events")',
);
const register_page_scraper = (endpoint, page, past_events = false) => {
let responses = []; let responses = [];
return new Promise((resolve, reject) => { return new Promise((resolve, reject) => {
page.on('response', async (response) => { page.on('response', async (response) => {
if (endpoint === response.request().url()) { if (endpoint === response.request().url()) {
let json = {};
try { try {
const json = await response.json(); json = await response.json();
const upcoming_events = get_upcoming_events_from_page(json); } catch (error) {
if (upcoming_events !== null) { return responses;
responses = [upcoming_events, ...responses]; }
if (!upcoming_events.page_info.has_next_page) {
resolve(responses); const getters = {
} upcoming: get_upcoming_events_from_page,
} past: get_past_events_from_page,
const past_events = get_past_events_from_page(json); };
if (past_events !== null) {
if ( const events = getters[past_events ? 'past' : 'upcoming'](json);
!past_events.page_info.has_next_page && if (events !== null) {
!upcoming_events.page_info.has_next_page responses = [events, ...responses];
) { if (!events.page_info.has_next_page) {
resolve(responses); resolve(responses);
}
} }
} catch (err) {
reject(err);
} }
} }
}); });
}); });
}; };
const get_page_events = async (browser, page_id, past_events) => { const get_page_events = async (
let scraping = true; browser,
page_id,
get_upcoming_events = true,
get_past_events = false,
) => {
const facebook_page = await browser.newPage(); const facebook_page = await browser.newPage();
const upcoming_events = register_page_scraper(graphql_endpoint, facebook_page)
.then((upcoming_events) => { let past_events = [];
scraping = false; let upcoming_events = [];
return upcoming_events;
}) let scraping_past_events = false;
.catch((err) => { let scraping_upcoming_events = false;
console.error(err);
scraping = false; if (get_past_events) {
}); scraping_past_events = true;
// set turn off timeout
past_events = register_page_scraper(graphql_endpoint, facebook_page, true)
.then((past_events) => {
scraping_past_events = false;
return past_events;
})
.catch((err) => {
console.error(err);
scraping_past_events = false;
return [];
});
} else {
past_events = Promise.resolve([]);
}
if (get_upcoming_events) {
scraping_upcoming_events = true;
// set turn off timeout
upcoming_events = register_page_scraper(graphql_endpoint, facebook_page)
.then((upcoming_events) => {
scraping_upcoming_events = false;
return upcoming_events;
})
.catch((err) => {
console.error(err);
scraping_upcoming_events = false;
return [];
});
} else {
upcoming_events = Promise.resolve([]);
}
await facebook_page.goto(page_id); await facebook_page.goto(page_id);
await facebook_page.waitFor(2000);
const past_resolved =
get_past_events && !(await has_past_events(facebook_page));
const upcoming_resolved =
get_upcoming_events && !(await has_upcoming_events(facebook_page));
if (past_resolved) {
past_events = Promise.resolve([]);
scraping_past_events = false;
}
if (upcoming_resolved) {
upcoming_events = Promise.resolve([]);
scraping_upcoming_events = false;
}
while (scraping) { while (scraping_past_events || scraping_upcoming_events) {
await facebook_page.waitFor(2000); await facebook_page.waitFor(1000);
await facebook_page.evaluate(() => window.scrollBy(0, window.innerHeight)); await facebook_page.evaluate(() => window.scrollBy(0, window.innerHeight));
if (past_resolved && upcoming_resolved) {
break;
}
} }
const responses = await upcoming_events; upcoming_events = await upcoming_events;
past_events = await past_events;
const responses = [...upcoming_events, ...past_events];
const nodes = responses.reduce( const nodes = responses.reduce(
(res, current) => [...res, ...current.edges], (res, current) => [...res, ...current.edges],
[], [],

2
src/scrape.js

@@ -57,7 +57,7 @@ const { page_ids, output, events: event_file } = parse_args(
* }), * }),
* ); */ * ); */
const all_events = [...events, ...previous_events] let all_events = merge_edges(events, previous_events)
.map((event) => { .map((event) => {
const start = pathOr(null, ['date', 'start'], event); const start = pathOr(null, ['date', 'start'], event);
if (start !== null) { if (start !== null) {

Loading…
Cancel
Save