Review amendments (added lines of scrape.js only): load_page now skips GraphQL
responses whose body is not valid JSON instead of leaving an unhandled
rejection, detaches its 'response' listener when done, and gives up after an
optional timeout_ms (default 30000) instead of waiting forever.
The scrape.js blob id on the index line predates these amendments.

diff --git a/package.json b/package.json
index 032be92..29aef08 100644
--- a/package.json
+++ b/package.json
@@ -3,6 +3,7 @@
   "dependencies": {
     "gm": "^1.23.1",
     "minimist": "^1.2.5",
-    "puppeteer": "^3.0.2"
+    "puppeteer": "^3.0.2",
+    "ramda": "^0.27.0"
   }
 }
diff --git a/scrape.js b/scrape.js
index 37e8834..bd9c0b7 100644
--- a/scrape.js
+++ b/scrape.js
@@ -1,185 +1,72 @@
 const puppeteer = require('puppeteer');
-const { JSDOM } = require('jsdom');
-const event_page = 'https://www.facebook.com/KulturVarsel/events';
+const { pathOr } = require('ramda');
+const parse_args = require('minimist');
 
-// const event_page = 'https://www.facebook.com/pg/bandmaldito/events';
-// const event_page = 'file:///home/zalox/src/kultar-events/index.chrome.html';
+const graphql_endpoint = 'https://www.facebook.com/api/graphql/';
 
-const upcoming_event_id = 'upcoming_events_card';
-const upcoming_event_selector = `#upcoming_events_card > div > div:nth-child(2) > table > tbody > tr`;
+const get_upcoming_events = pathOr(
+  null,
+  'data.page.upcoming_events'.split('.'),
+);
 
-const month_name_to_number = (month_name) => {
-  switch (month_name.toUpperCase()) {
-    case 'JAN':
-      return 1;
-    case 'FEB':
-      return 2;
-    case 'MAR':
-      return 3;
-    case 'APR':
-      return 4;
-    case 'MAY':
-      return 5;
-    case 'JUN':
-      return 6;
-    case 'JUL':
-      return 7;
-    case 'AUG':
-      return 8;
-    case 'SEP':
-      return 9;
-    case 'OCT':
-      return 10;
-    case 'NOV':
-      return 10;
-    case 'DEC':
-      return 12;
-  }
-};
-
-const parse_event_time = (event_time_text) => {
-  const reversed_text = event_time_text.split('').reverse().join('');
-  if (!(reversed_text.substr(2, 1) == '+')) {
-    return null;
-  }
-
-  const timezone = reversed_text.substr(0, 6).split('').reverse().join('');
-  const minutes = reversed_text.substr(7, 2).split('').reverse().join('');
-  const hour = reversed_text.substr(10, 2).split('').reverse().join('').trim();
-  return {
-    tz: timezone,
-    min: minutes,
-    hour,
-  };
-};
-
-const parse_ticket_location = (row) => {
-  const host = row.children[2].firstChild.firstChild.firstChild.innerHTML;
-  const location = row.children[2].firstChild.lastChild.innerHTML;
-  return {
-    host,
-    location,
-  };
-};
-
-const parse_event_date = (row) => {
-  const date_column = row.firstChild.firstChild;
-  const month_text = new String(date_column.firstChild.innerHTML);
-  const month = new Number(month_name_to_number(month_text)) - 1;
-  const day = new Number(date_column.lastChild.innerHTML);
-  const event_time_text = row.children[1].lastChild.getElementsByTagName(
-    'span',
-  )[1].innerHTML;
-  const event_time = parse_event_time(event_time_text);
-
-  if (event_time === null) {
-    return new Date(2020, month, day);
-  }
-
-  return new Date(2020, month, day, event_time.hour, event_time.min);
-};
-
-const parse_event_link = (row) => {
-  const link_text = row.children[1].firstChild.getElementsByTagName('a')[0]
-    .href;
-  const event_id = link_text.split('/')[2];
-  return event_id;
-};
-
-const parse_ticket_url = (row) => {
-  const link_text = row.lastChild.firstChild.firstChild;
-
-  if (link_text.children.length === 0) {
-    return null;
-  }
-
-  const url = new URL(
-    link_text.getElementsByTagName('a')[0].href,
-  ).searchParams.get('u');
-
-  return url;
-};
-
-const parse_event_name = (row) => {
-  const event_name = row.children[1].firstChild.getElementsByTagName('span')[0]
-    .innerHTML;
-  return event_name.trim();
-};
-
-const load_page = async () => {
+/**
+ * Opens `event_page` and resolves with the upcoming-events payload taken from
+ * the first GraphQL response that carries `data.page.upcoming_events`.
+ *
+ * @param {import('puppeteer').Page} page - Page used for the navigation.
+ * @param {string} event_page - URL of the Facebook events page.
+ * @param {number} [timeout_ms=30000] - How long to wait for the payload once
+ *   the page has been scrolled.
+ * @returns {Promise<*>} The payload, or `undefined` after a logged failure.
+ */
+const load_page = async (page, event_page, timeout_ms = 30000) => {
+  let timer = null;
+  let on_response = null;
   try {
-    const browser = await puppeteer.launch({ headless: true });
-
-    const page = await browser.newPage();
-
-    await page.goto(event_page);
-    await page.waitForSelector(upcoming_event_selector);
-
-    await page.evaluate(() => {
-      window.scrollBy(0, window.innerHeight);
+    const graphql_data = new Promise((resolve) => {
+      on_response = async (response) => {
+        if (graphql_endpoint !== response.request().url()) {
+          return;
+        }
+        try {
+          const upcoming_events = get_upcoming_events(await response.json());
+          if (upcoming_events !== null) {
+            resolve(upcoming_events);
+          }
+        } catch (e) {
+          // response.json() throws on bodies that are not valid JSON; such a
+          // response cannot hold the payload, so skip it instead of leaving
+          // an unhandled rejection inside the event listener.
+        }
+      };
+      page.on('response', on_response);
     });
-
-    await page.waitFor(1000);
-
-    let get_events = (upcoming_event_id) => {
-      const upcoming_events_element = document.getElementById(
-        upcoming_event_id,
-      );
-
-      if (upcoming_events_element === null) {
-        throw new Error(`Element ${upcoming_event_id} was not found.`);
-      }
-
-      if (upcoming_events_element.firstChild === null) {
-        throw new Error(
-          `Element ${upcoming_event_id} firstChild was not found.`,
-        );
-      }
-
-      if (upcoming_events_element.firstChild.children === null) {
-        throw new Error(`Element ${upcoming_event_id} children not found.`);
-      }
-
-      return Array.from(upcoming_events_element.firstChild.children).map(
-        (item) => item.innerHTML,
-      );
-    };
-
-    const events = await page.evaluate(get_events, upcoming_event_id);
-
-    const htmlToTableRowElement = (table) => {
-      const { document } = new JSDOM(table).window;
-      return Array.from(document.body.getElementsByTagName('tr'))[0];
-    };
-
-    const emptyArrays = (item) => item;
-
-    const parseRowToEvents = (table_row) => {
-      const date = parse_event_date(table_row);
-      const name = parse_event_name(table_row);
-      const event_id = parse_event_link(table_row);
-      const ticket_url = parse_ticket_url(table_row);
-      const location = parse_ticket_location(table_row);
-      return {
-        date,
-        name,
-        event_id,
-        ticket_url,
-        location,
-      };
-    };
-
-    const parsed_events = events
-      .map(htmlToTableRowElement)
-      .filter(emptyArrays)
-      .map(parseRowToEvents);
-    console.log(JSON.stringify(parsed_events));
+    await page.goto(event_page);
+    await page.evaluate(() => window.scrollBy(0, window.innerHeight));
+    const timed_out = new Promise((resolve, reject) => {
+      timer = setTimeout(
+        () => reject(new Error(`No upcoming events after ${timeout_ms} ms`)),
+        timeout_ms,
+      );
+    });
+    return await Promise.race([graphql_data, timed_out]);
   } catch (e) {
     console.error(e);
+  } finally {
+    clearTimeout(timer);
+    if (on_response !== null) {
+      page.removeListener('response', on_response);
+    }
   }
 };
 
 (async () => {
-  await load_page();
+  const browser = await puppeteer.launch({
+    headless: true,
+    args: ['--disable-dev-shm-usage'],
+  });
+  const page = await browser.newPage();
+  const event_page = 'https://www.facebook.com/KulturVarsel/events/';
+  console.log(JSON.stringify(await load_page(page, event_page)));
   process.exit();
 })();
diff --git a/yarn.lock b/yarn.lock
index bbeb35c..91e56e1 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -306,6 +306,11 @@ puppeteer@^3.0.2:
     unbzip2-stream "^1.3.3"
     ws "^7.2.3"
 
+ramda@^0.27.0:
+  version "0.27.0"
+  resolved "https://registry.yarnpkg.com/ramda/-/ramda-0.27.0.tgz#915dc29865c0800bf3f69b8fd6c279898b59de43"
+  integrity sha512-pVzZdDpWwWqEVVLshWUHjNwuVP7SfcmPraYuqocJp1yo2U1R7P+5QAfDhdItkuoGqIBnBYrtPp7rEPqDn9HlZA==
+
 readable-stream@^3.1.1, readable-stream@^3.4.0:
   version "3.6.0"
   resolved "https://registry.yarnpkg.com/readable-stream/-/readable-stream-3.6.0.tgz#337bbda3adc0706bd3e024426a286d4b4b2c9198"