Browse Source

cleanup some shit

fix-broken-scrape
Jørgen Sverre Lien Sellæg 4 years ago
parent
commit
0dd1ffbb9d
  1. 139
      src/browse.js
  2. 123
      src/facebook-page-events.js
  3. 54
      src/facebook-payload.txt
  4. 0
      src/facebook/get-event.mjs
  5. 2
      src/facebook/get-page-events.mjs
  6. 1
      src/facebook/graphql-api-request.mjs
  7. 7
      src/maldito-mapper.js
  8. 2636
      src/output.json
  9. 92
      src/scrape-event.js
  10. 42
      src/scrape.js

139
src/browse.js

@ -1,139 +0,0 @@
const puppeteer = require('puppeteer');
const {
has_past_events,
has_upcoming_events,
get_upcoming_events_from_page,
get_past_events_from_page,
map_event,
} = require('./logic');
const { graphql_endpoint } = require('./constants');
const open_browser = async ({ headless }) => {
const browser = await puppeteer.launch({
headless,
args: ['--disable-dev-shm-usage'],
});
return browser;
};
// Listens for GraphQL responses from `endpoint` on the given puppeteer page
// and accumulates the event connection pages found in them. Resolves with
// all collected pages once a response reports `page_info.has_next_page ===
// false`.
//
// NOTE(review): the returned promise never settles if the endpoint is never
// hit or the final page never arrives — callers race it against their own
// scroll/timeout loop; confirm that is intended.
const register_page_scraper = (endpoint, page, past_events = false) => {
  let responses = [];
  return new Promise((resolve) => {
    page.on('response', async (response) => {
      if (endpoint !== response.request().url()) {
        return;
      }
      let json = {};
      try {
        json = await response.json();
      } catch (error) {
        // Non-JSON body: skip this response. (The original `return
        // responses;` returned a value from the event handler, which
        // puppeteer discards — a plain return is equivalent and not
        // misleading.)
        return;
      }
      const getters = {
        upcoming: get_upcoming_events_from_page,
        past: get_past_events_from_page,
      };
      const events = getters[past_events ? 'past' : 'upcoming'](json);
      if (events !== null) {
        responses = [events, ...responses];
        if (!events.page_info.has_next_page) {
          resolve(responses);
        }
      }
    });
  });
};
// Resolves with the rendered inner text of the page's <body>.
const get_body_inner_text = (page) => {
  const expression = 'document.querySelector("body").innerText;';
  return page.evaluate(expression);
};
// Opens a browser, navigates to the Facebook page, and scrolls it until the
// registered response scrapers have collected every requested event page.
// opt: { headless, page_id (a URL passed to goto), get_past_events,
// get_upcoming_events }. Returns the mapped event nodes, upcoming first.
const get_page_events = async (opt) => {
  const browser = await open_browser(opt);
  const facebook_page = await browser.newPage();
  let past_events = [];
  let upcoming_events = [];
  // These flags drive the scroll loop below; each scraper promise clears
  // its flag when it settles (resolve or reject).
  let scraping_past_events = false;
  let scraping_upcoming_events = false;
  if (opt.get_past_events) {
    scraping_past_events = true;
    // The scraper must be registered *before* navigation so no response
    // is missed.
    past_events = register_page_scraper(graphql_endpoint, facebook_page, true)
      .then((past_events) => {
        scraping_past_events = false;
        return past_events;
      })
      .catch((err) => {
        // Best effort: log and fall back to no past events.
        console.error(err);
        scraping_past_events = false;
        return [];
      });
  } else {
    past_events = Promise.resolve([]);
  }
  if (opt.get_upcoming_events) {
    scraping_upcoming_events = true;
    upcoming_events = register_page_scraper(graphql_endpoint, facebook_page)
      .then((upcoming_events) => {
        scraping_upcoming_events = false;
        return upcoming_events;
      })
      .catch((err) => {
        console.error(err);
        scraping_upcoming_events = false;
        return [];
      });
  } else {
    upcoming_events = Promise.resolve([]);
  }
  await facebook_page.goto(opt.page_id);
  await facebook_page.waitFor(2000);
  // Dismiss the cookie-consent dialog if present. The click is deliberately
  // not awaited — best effort.
  const accept_buttons = await facebook_page.$x(
    "//button[contains(text(), 'Accept All')]",
  );
  if (accept_buttons.length > 0) {
    accept_buttons[0].click();
  }
  // If the body text says the page has no past/upcoming events, the
  // corresponding scraper promise would never resolve — replace it with an
  // already-resolved empty result instead.
  const body_text = (await get_body_inner_text(facebook_page)).toLowerCase();
  const past_resolved = opt.get_past_events && !has_past_events(body_text);
  const upcoming_resolved =
    opt.get_upcoming_events && !has_upcoming_events(body_text);
  if (past_resolved) {
    past_events = Promise.resolve([]);
    scraping_past_events = false;
  }
  if (upcoming_resolved) {
    upcoming_events = Promise.resolve([]);
    scraping_upcoming_events = false;
  }
  // Keep scrolling to trigger lazy loading of further event pages until both
  // scrapers have settled.
  // NOTE(review): the browser is never closed here — presumably the process
  // exits right after; confirm callers do not leak browser instances.
  while (scraping_past_events || scraping_upcoming_events) {
    await facebook_page.waitFor(1000);
    await facebook_page.evaluate(() => window.scrollBy(0, window.innerHeight));
    if (past_resolved && upcoming_resolved) {
      break;
    }
  }
  upcoming_events = await upcoming_events;
  past_events = await past_events;
  // Each collected response is a GraphQL connection page; flatten all of
  // their edges before mapping to plain event objects.
  const responses = [...upcoming_events, ...past_events];
  const nodes = responses.reduce(
    (res, current) => [...res, ...current.edges],
    [],
  );
  return nodes.map(map_event);
};
module.exports = {
  get_page_events,
};

123
src/facebook-page-events.js

@ -1,123 +0,0 @@
const {
get_edges,
get_page_info,
get_past_events_from_page,
get_upcoming_events_from_page,
sleep,
} = require('./logic');
const { do_request } = require('./facebook-request');
const last = require('ramda/src/last');
// Pages through a GraphQL events connection, starting at `variables.cursor`,
// until the API reports no further pages. Accumulates onto `edges` and
// returns the full edge list. Sleeps 2s between requests to throttle.
const get_events = async (doc_id, variables, get_events_from_page, edges) => {
  let { cursor } = variables;
  let next = cursor !== null;
  while (next) {
    // BUG FIX: the request must carry the *current* cursor. The original
    // destructured `cursor` once and never wrote it back into `variables`,
    // so every iteration re-requested the same page.
    let page = await do_request(doc_id, { ...variables, cursor });
    const { has_next_page } = get_page_info(page);
    page = get_events_from_page(page);
    edges = [...edges, ...get_edges(page)];
    next = has_next_page;
    // Guard: on an empty edge list the original `last(edges).cursor` threw.
    const last_edge = last(edges);
    cursor = last_edge ? last_edge.cursor : null;
    if (cursor === null) {
      // No cursor to continue from — stop even if the API claims more pages.
      next = false;
    }
    await sleep(2);
  }
  return edges;
};
// Fetches all upcoming events for a page, continuing from the given cursor.
// `edges`/`cursor` come from init_scrape's first-page results.
const get_upcoming_events = (page_id, { edges, cursor }) => {
  const doc_id = '4766951026653856';
  const variables = {
    count: 3,
    cursor,
    scale: 1,
    id: `${page_id}`,
  };
  return get_events(doc_id, variables, get_upcoming_events_from_page, edges);
};
// Fetches all past events for a page, accumulating onto `events`.
const get_past_events = (page_id, events) => {
  const doc_id = '4082043558578171';
  return get_events(doc_id, {}, get_past_events_from_page, events);
};
// First request: fetches the page's events tab (upcoming and past together).
const get_initial_events = (page_id) => {
  const doc_id = '4071780429584964';
  const variables = { pageID: `${page_id}`, scale: 1 };
  return do_request(doc_id, variables);
};
// Placeholder: recurring-event scraping is not implemented yet.
const get_reoccuring_events = function () {};
// Performs the initial scrape and splits the response into upcoming/past
// edge lists, each paired with the cursor to continue paging from
// (null when that connection has no further pages).
const init_scrape = async (page_id) => {
  const res = await get_initial_events(page_id);

  // Turn one connection into { edges, cursor } paging state.
  const to_state = (events) => {
    const edges = get_edges(events);
    const { has_next_page } = get_page_info(events);
    return {
      edges,
      cursor: has_next_page ? last(edges).cursor : null,
    };
  };

  return {
    upcoming_events: to_state(get_upcoming_events_from_page(res)),
    past_events: to_state(get_past_events_from_page(res)),
  };
};
// Scrapes a Facebook page's events through the GraphQL API and returns the
// raw event nodes. opt: { page_id, get_upcoming_events, get_past_events }.
const get_page_events = async (opt) => {
  const { page_id } = opt;
  let { upcoming_events, past_events } = await init_scrape(page_id);
  /* if (opt.get_past_events) {
   * await sleep(2);
   * past_events = await get_past_events(page_id, past_events);
   * } else {
   * past_events = [];
   * }
   */
  // Past-events paging is disabled for now (see the commented block above);
  // `opt.get_past_events` is currently ignored — presumably a workaround for
  // the broken scrape this commit addresses; TODO confirm and re-enable.
  past_events = [];
  if (opt.get_upcoming_events) {
    await sleep(2);
    upcoming_events = await get_upcoming_events(page_id, upcoming_events);
  } else {
    upcoming_events = [];
  }
  // Each element is a GraphQL edge { node, cursor }; callers want the nodes.
  const nodes = [...upcoming_events, ...past_events];
  return nodes.map(({ node }) => node);
};
module.exports = {
  get_page_events,
};

54
src/facebook-payload.txt

File diff suppressed because one or more lines are too long

0
src/facebook-event.js → src/facebook/get-event.mjs

2
src/darma.mjs → src/facebook/get-page-events.mjs

@ -1,4 +1,4 @@
import { do_request } from './facebook-request.mjs'; import { do_request } from './graphql-api-request.mjs';
const sleep = (s) => new Promise((res) => setTimeout(res, s * 1000)); const sleep = (s) => new Promise((res) => setTimeout(res, s * 1000));
/// PageEventsTabPastEventsCardRendererQuery /// PageEventsTabPastEventsCardRendererQuery

1
src/facebook-request.mjs → src/facebook/graphql-api-request.mjs

@ -1,7 +1,6 @@
import fetch from 'node-fetch'; import fetch from 'node-fetch';
const graphql_endpoint = 'https://www.facebook.com/api/graphql/'; const graphql_endpoint = 'https://www.facebook.com/api/graphql/';
import * as url from 'url'; import * as url from 'url';
import https_proxy_agent from 'https-proxy-agent'; import https_proxy_agent from 'https-proxy-agent';
const proxies = [ const proxies = [

7
src/maldito-mapper.js

@ -1,7 +0,0 @@
// One-off script: maps the raw scraped events in output.json into
// normalized, date-sorted events and prints them as JSON on stdout.
const raw_events = require('./output.json');
const { by_date, event_times_to_dates, map_event } = require('./logic');

const normalized = raw_events
  .map(map_event)
  .map(event_times_to_dates)
  .sort(by_date);

console.log(JSON.stringify(normalized));

2636
src/output.json

File diff suppressed because it is too large Load Diff

92
src/scrape-event.js

@ -1,92 +0,0 @@
const {
parse_args,
sleep,
read_previous_events,
write_events,
} = require('./logic');
const { get_event_details } = require('./facebook-event.js');
const { get_page_events } = require('./facebook-page-events.js');
const { omit } = require('ramda');
const options = parse_args(process.argv.slice(2));

// Scrapes a page's events, fetching full details only for events that are
// new or whose `updated_time` changed since the previous run; merges back
// previously-scraped events no longer listed; strips Facebook-internal
// fields; then prints the result or writes it to options.output.
(async () => {
  const previous_events = await read_previous_events(options.events);
  const page_events = await get_page_events(options);
  let events = [];
  for (const event of page_events) {
    // Loose equality kept on purpose: ids may arrive as string or number.
    const index = previous_events.findIndex(({ id }) => event.id == id);
    if (index === -1) {
      // BUG FIX: `sleep(2)` was not awaited, so the intended throttle
      // between detail requests never actually happened.
      await sleep(2);
      console.error(
        `INFO: New event. ${event.name}, fetching details. ${event.url}`,
      );
      const event_details = await get_event_details(event.id);
      events.push({ ...event, ...event_details });
    } else if (previous_events[index].updated_time != event.updated_time) {
      await sleep(2);
      console.error(`INFO: Event needs ${event.name} updating. ${event.url}`);
      const event_details = await get_event_details(event.id);
      events.push({ ...event, ...event_details });
    } else {
      console.error(`INFO: Event ${event.name} already scraped. ${event.url}`);
      events.push(previous_events[index]);
    }
  }
  // Keep previously-scraped events that the current scrape did not return.
  // (The original tracked an index variable and reset it to 0 each pass to
  // no effect; `some` expresses the membership test directly.)
  for (const prev_event of previous_events) {
    if (!events.some(({ id }) => id === prev_event.id)) {
      events.push(prev_event);
    }
  }
  // Drop Facebook admin/advertising fields we never consume downstream.
  const filterAwayFields = omit([
    'ad_groups',
    'ads_data',
    'can_viewer_promote',
    'can_viewer_purchase_onsite_tickets',
    'categoryInfo',
    'child_events',
    'cover_photo',
    'cover_video',
    'event_insights',
    'event_place',
    'event_promotion_info',
    'event_ticketing_type',
    'event_viewer_capability',
    'has_child_events',
    'has_viewer_sent_message_or_requested_tickets',
    'is_boostable',
    'is_event_draft',
    'is_past',
    'is_pay_to_access_content',
    'is_viewer_user_admin_of_page',
    'parent_event',
    'poe_violation_state',
    'scheduled_publish_timestamp',
    'ticket_tiers',
  ]);
  events = events.map(filterAwayFields);
  if (options.output === null) {
    console.log(JSON.stringify(events));
    process.exit();
  }
  try {
    await write_events(options.output, events);
    process.exit();
  } catch (e) {
    console.error(e);
    process.exit(1);
  }
})();

42
src/scrape.js

@ -1,42 +0,0 @@
const {
by_date,
event_date_to_date_obj,
parse_args,
read_previous_events,
to_unique_events,
write_events,
} = require('./logic');
const { get_page_events } = require('./facebook-page-events');
const options = parse_args(process.argv.slice(2));

// CLI entry point: scrape a page's events, merge with previously saved
// events, normalize dates, sort, then print to stdout or write to the
// output file.
(async () => {
  let scraped = [];
  let previous = [];
  try {
    scraped = await get_page_events(options);
    previous = await read_previous_events(options.events);
  } catch (err) {
    // Best effort: continue with whatever was collected (possibly nothing).
    console.error(err);
  }
  const events = scraped
    .reduce(to_unique_events, previous)
    .map(event_date_to_date_obj)
    .sort(by_date);
  if (options.output === null) {
    console.log(JSON.stringify(events));
    process.exit();
  }
  try {
    await write_events(options.output, events);
    process.exit();
  } catch (err) {
    console.error(err);
    process.exit(1);
  }
})();
Loading…
Cancel
Save