Browse Source

Now scrapes by listening for GraphQL responses

fix-broken-scrape
Jørgen Lien Sellæg 6 years ago
parent
commit
0456ed61a2
  1. 3
      package.json
  2. 197
      scrape.js
  3. 5
      yarn.lock

3
package.json

@ -3,6 +3,7 @@
"dependencies": {
"gm": "^1.23.1",
"minimist": "^1.2.5",
"puppeteer": "^3.0.2"
"puppeteer": "^3.0.2",
"ramda": "^0.27.0"
}
}

197
scrape.js

@ -1,185 +1,42 @@
const puppeteer = require('puppeteer');
const { JSDOM } = require('jsdom');
const event_page = 'https://www.facebook.com/KulturVarsel/events';
const { pathOr } = require('ramda');
const parse_args = require('minimist');
// const event_page = 'https://www.facebook.com/pg/bandmaldito/events';
// const event_page = 'file:///home/zalox/src/kultar-events/index.chrome.html';
const graphql_endpoint = 'https://www.facebook.com/api/graphql/';
const upcoming_event_id = 'upcoming_events_card';
const upcoming_event_selector = `#upcoming_events_card > div > div:nth-child(2) > table > tbody > tr`;
// Three-letter English month abbreviations in calendar order, so that
// (index + 1) is the month number.
const month_abbreviations = [
  'JAN',
  'FEB',
  'MAR',
  'APR',
  'MAY',
  'JUN',
  'JUL',
  'AUG',
  'SEP',
  'OCT',
  'NOV',
  'DEC',
];

/**
 * Converts a three-letter English month abbreviation to its calendar number.
 *
 * The previous switch returned 10 for 'NOV'; the ordered lookup makes that
 * class of copy-paste mistake impossible.
 *
 * @param {string} month_name - Abbreviation such as 'Jan' or 'NOV'; matched
 *   case-insensitively.
 * @returns {number|undefined} 1 for JAN through 12 for DEC, or undefined when
 *   the abbreviation is not recognised.
 */
const month_name_to_number = (month_name) => {
  const index = month_abbreviations.indexOf(month_name.toUpperCase());
  return index === -1 ? undefined : index + 1;
};
/**
 * Pulls the timezone, minutes and hour out of the tail of an event time label.
 *
 * All positions are counted from the END of the text, so whatever precedes
 * the time (presumably a weekday, e.g. 'Sat 20:00 UTC+02' — verify against
 * real page data) does not affect the result.
 *
 * @param {string} event_time_text - Raw label text.
 * @returns {{tz: string, min: string, hour: string}|null} The extracted
 *   pieces as strings, or null when the third character from the end is
 *   not '+'.
 */
const parse_event_time = (event_time_text) => {
  const length = event_time_text.length;

  // A '+' three characters from the end marks a trailing "+NN" offset;
  // without it the label is treated as having no parseable time.
  if (event_time_text.charAt(length - 3) !== '+') {
    return null;
  }

  // substring() clamps negative indices to 0, which mirrors how the fields
  // simply come out shorter (or empty) when the text is too short.
  const tz = event_time_text.slice(-6);
  const min = event_time_text.substring(length - 9, length - 7);
  const hour = event_time_text.substring(length - 12, length - 10).trim();

  return { tz, min, hour };
};
/**
 * Reads the host and location markup from the third cell of an event row.
 *
 * @param {Element} row - Table row element of one upcoming event.
 * @returns {{host: string, location: string}} innerHTML of the nested host
 *   node and of the cell wrapper's last child, respectively.
 */
const parse_ticket_location = (row) => {
  // The third cell's first child wraps both pieces of information.
  const details_wrapper = row.children[2].firstChild;
  return {
    host: details_wrapper.firstChild.firstChild.innerHTML,
    location: details_wrapper.lastChild.innerHTML,
  };
};
/**
 * Builds a local-time Date for an event from its table row.
 *
 * The row's first cell supplies the month abbreviation and the day of month;
 * the second span in the last child of the second cell supplies the time
 * label, which is handed to parse_event_time.
 *
 * @param {Element} row - Table row element of one upcoming event.
 * @param {number} [year=2020] - Year to assign to the event. The row itself
 *   is not read for a year, so callers that know it should pass it; the
 *   default keeps the previously hard-coded value.
 * @returns {Date} Date at the parsed hour/minute, or at midnight when the
 *   time label cannot be parsed. The Date is invalid if the month
 *   abbreviation is not recognised.
 */
const parse_event_date = (row, year = 2020) => {
  const date_column = row.firstChild.firstChild;

  // Coerce to primitives with String()/Number() rather than the wrapper
  // objects created by `new String` / `new Number`.
  const month_text = String(date_column.firstChild.innerHTML);
  // Date months are zero-based, month_name_to_number is one-based.
  const month = Number(month_name_to_number(month_text)) - 1;
  const day = Number(date_column.lastChild.innerHTML);

  const time_spans = row.children[1].lastChild.getElementsByTagName('span');
  const event_time = parse_event_time(time_spans[1].innerHTML);
  if (event_time === null) {
    return new Date(year, month, day);
  }

  return new Date(
    year,
    month,
    day,
    Number(event_time.hour),
    Number(event_time.min),
  );
};
/**
 * Extracts the event id from the first link in the row's second cell.
 *
 * @param {Element} row - Table row element of one upcoming event.
 * @returns {string|undefined} The third '/'-separated piece of the link's
 *   href (presumably the id in a path like '/events/<id>/' — verify against
 *   real page data), or undefined when the href has fewer pieces.
 */
const parse_event_link = (row) => {
  const title_cell = row.children[1].firstChild;
  const { href } = title_cell.getElementsByTagName('a')[0];
  const path_pieces = href.split('/');
  return path_pieces[2];
};
/**
 * Resolves the ticket URL carried in the `u` query parameter of the link in
 * the row's last cell.
 *
 * @param {Element} row - Table row element of one upcoming event.
 * @returns {string|null} The decoded value of the `u` parameter, or null
 *   when the cell holds no child elements or the link has no `u` parameter.
 * @throws {TypeError} If the link's href is not an absolute URL.
 */
const parse_ticket_url = (row) => {
  const ticket_cell = row.lastChild.firstChild.firstChild;

  // An element without children means no ticket link was rendered.
  const has_link = ticket_cell.children.length !== 0;
  if (!has_link) {
    return null;
  }

  const { href } = ticket_cell.getElementsByTagName('a')[0];
  const { searchParams } = new URL(href);
  return searchParams.get('u');
};
/**
 * Reads the event title from the first span in the row's second cell.
 *
 * @param {Element} row - Table row element of one upcoming event.
 * @returns {string} The span's innerHTML with surrounding whitespace removed.
 */
const parse_event_name = (row) => {
  const title_cell = row.children[1].firstChild;
  const { innerHTML } = title_cell.getElementsByTagName('span')[0];
  return innerHTML.trim();
};
const load_page = async () => {
try {
const browser = await puppeteer.launch({ headless: true });
const page = await browser.newPage();
await page.goto(event_page);
await page.waitForSelector(upcoming_event_selector);
await page.evaluate(() => {
window.scrollBy(0, window.innerHeight);
});
await page.waitFor(1000);
let get_events = (upcoming_event_id) => {
const upcoming_events_element = document.getElementById(
upcoming_event_id,
// Property path, from the root of a parsed GraphQL response body, to the
// upcoming-events payload.
const upcoming_events_path = 'data.page.upcoming_events'.split('.');

/**
 * Looks up the upcoming-events payload in a parsed response body.
 * Built with ramda's pathOr using null as the fallback, so responses that do
 * not contain the path yield null.
 */
const get_upcoming_events = pathOr(null, upcoming_events_path);
if (upcoming_events_element === null) {
throw new Error(`Element ${upcoming_event_id} was not found.`);
const load_page = async (page, event_page) => {
try {
const graphql_data = new Promise((resolve, reject) => {
page.on('response', async (response) => {
if (graphql_endpoint === response.request().url()) {
const text = await response.json();
const upcoming_events = get_upcoming_events(text);
if (upcoming_events !== null) {
resolve(upcoming_events);
}
if (upcoming_events_element.firstChild === null) {
throw new Error(
`Element ${upcoming_event_id} firstChild was not found.`,
);
}
if (upcoming_events_element.firstChild.children === null) {
throw new Error(`Element ${upcoming_event_id} children not found.`);
}
return Array.from(upcoming_events_element.firstChild.children).map(
(item) => item.innerHTML,
);
};
const events = await page.evaluate(get_events, upcoming_event_id);
const htmlToTableRowElement = (table) => {
const { document } = new JSDOM(table).window;
return Array.from(document.body.getElementsByTagName('tr'))[0];
};
const emptyArrays = (item) => item;
const parseRowToEvents = (table_row) => {
const date = parse_event_date(table_row);
const name = parse_event_name(table_row);
const event_id = parse_event_link(table_row);
const ticket_url = parse_ticket_url(table_row);
const location = parse_ticket_location(table_row);
return {
date,
name,
event_id,
ticket_url,
location,
};
};
const parsed_events = events
.map(htmlToTableRowElement)
.filter(emptyArrays)
.map(parseRowToEvents);
console.log(JSON.stringify(parsed_events));
});
});
await page.goto(event_page);
await page.evaluate(() => window.scrollBy(0, window.innerHeight));
return await graphql_data;
} catch (e) {
console.error(e);
}
};
// Script entry point: launches headless Chromium via puppeteer, runs
// load_page against the Facebook events page and prints whatever it resolves
// to as a single JSON string on stdout, then terminates the process.
(async () => {
// NOTE(review): this argument-less call sits next to the two-argument call
// below and looks like a leftover line from the previous revision of this
// diff hunk — confirm against the actual scrape.js before relying on it.
await load_page();
const browser = await puppeteer.launch({
headless: true,
// Chromium flag; presumably set to avoid crashes where /dev/shm is small
// (e.g. containers) — TODO confirm the deployment environment.
args: ['--disable-dev-shm-usage'],
});
const page = await browser.newPage();
const event_page = 'https://www.facebook.com/KulturVarsel/events/';
// load_page catches and logs its own errors, in which case it resolves to
// undefined and this prints `undefined`.
console.log(JSON.stringify(await load_page(page, event_page)));
// The browser is never closed with browser.close(); exiting explicitly is
// what ends the process here.
process.exit();
})();

5
yarn.lock

@ -306,6 +306,11 @@ puppeteer@^3.0.2:
unbzip2-stream "^1.3.3"
ws "^7.2.3"
ramda@^0.27.0:
version "0.27.0"
resolved "https://registry.yarnpkg.com/ramda/-/ramda-0.27.0.tgz#915dc29865c0800bf3f69b8fd6c279898b59de43"
integrity sha512-pVzZdDpWwWqEVVLshWUHjNwuVP7SfcmPraYuqocJp1yo2U1R7P+5QAfDhdItkuoGqIBnBYrtPp7rEPqDn9HlZA==
readable-stream@^3.1.1, readable-stream@^3.4.0:
version "3.6.0"
resolved "https://registry.yarnpkg.com/readable-stream/-/readable-stream-3.6.0.tgz#337bbda3adc0706bd3e024426a286d4b4b2c9198"

Loading…
Cancel
Save