const puppeteer = require('puppeteer');
const { JSDOM } = require('jsdom');

// Facebook page whose upcoming events are scraped.
const event_page = 'https://www.facebook.com/KulturVarsel/events';
// Alternative sources kept around from development:
// const event_page = 'https://www.facebook.com/pg/bandmaldito/events';
// const event_page = 'file:///home/zalox/src/kultar-events/index.chrome.html';

// DOM id of the card that holds the upcoming events, and the selector of the
// table rows inside it that must exist before scraping starts.
const upcoming_event_id = 'upcoming_events_card';
const upcoming_event_selector = `#${upcoming_event_id} > div > div:nth-child(2) > table > tbody > tr`;

// How long to wait after scrolling before reading the DOM, in milliseconds.
const SCROLL_SETTLE_MS = 1000;

// Three-letter English month abbreviations mapped to 1-based month numbers.
const MONTH_NUMBERS = new Map([
  ['JAN', 1],
  ['FEB', 2],
  ['MAR', 3],
  ['APR', 4],
  ['MAY', 5],
  ['JUN', 6],
  ['JUL', 7],
  ['AUG', 8],
  ['SEP', 9],
  ['OCT', 10],
  ['NOV', 11],
  ['DEC', 12],
]);

/**
 * Converts a three-letter month abbreviation to its 1-based month number.
 *
 * @param {string} month_name - Month abbreviation, matched case-insensitively.
 * @returns {number|undefined} 1-12, or undefined for an unknown abbreviation.
 */
const month_name_to_number = (month_name) =>
  MONTH_NUMBERS.get(month_name.toUpperCase());

/**
 * Extracts hour, minutes and timezone from the tail of an event time text.
 *
 * The text is read from its end using fixed offsets: the last six characters
 * are the timezone (its third-to-last character must be '+'), preceded by a
 * one-character separator, two minute characters, another separator and up to
 * two hour characters.
 * NOTE(review): this presumably matches texts ending like "19:00 UTC+02" -
 * confirm against the live page markup.
 *
 * @param {string} event_time_text - Raw text of the event time element.
 * @returns {{tz: string, min: string, hour: string}|null} The parsed parts as
 *   strings, or null when the text does not end in the expected '+NN' form.
 */
const parse_event_time = (event_time_text) => {
  if (event_time_text.charAt(event_time_text.length - 3) !== '+') {
    return null;
  }
  return {
    tz: event_time_text.slice(-6),
    min: event_time_text.slice(-9, -7),
    // A single-digit hour leaves the preceding separator in the slice.
    hour: event_time_text.slice(-12, -10).trim(),
  };
};

/**
 * Reads the host and location markup from the third cell of an event row.
 *
 * @param {Element} row - Table row (<tr>) of one event.
 * @returns {{host: string, location: string}} innerHTML of the host and
 *   location elements.
 */
const parse_ticket_location = (row) => {
  const cell_content = row.children[2].firstChild;
  return {
    host: cell_content.firstChild.firstChild.innerHTML,
    location: cell_content.lastChild.innerHTML,
  };
};

/**
 * Builds the event's start date from the date column and the time text.
 *
 * The page only provides month and day, so the year has to be supplied.
 * NOTE(review): events listed late in the year for the following year will
 * get the wrong year with the default; the parsed timezone is also ignored,
 * so the Date is constructed in the local timezone of this process.
 *
 * @param {Element} row - Table row (<tr>) of one event.
 * @param {number} [year] - Year to use; defaults to the current year.
 * @returns {Date} The event date; an Invalid Date if month or day cannot be
 *   parsed. Time is midnight when no time could be parsed.
 */
const parse_event_date = (row, year = new Date().getFullYear()) => {
  const date_column = row.firstChild.firstChild;
  const month_text = String(date_column.firstChild.innerHTML);
  // Date months are 0-based; an unknown month yields NaN (Invalid Date).
  const month = Number(month_name_to_number(month_text)) - 1;
  const day = Number(date_column.lastChild.innerHTML);
  const event_time_text = row.children[1].lastChild.getElementsByTagName(
    'span',
  )[1].innerHTML;
  const event_time = parse_event_time(event_time_text);
  if (event_time === null) {
    return new Date(year, month, day);
  }
  return new Date(
    year,
    month,
    day,
    Number(event_time.hour),
    Number(event_time.min),
  );
};

/**
 * Extracts the event id from the first link in the row's second cell.
 *
 * @param {Element} row - Table row (<tr>) of one event.
 * @returns {string|undefined} The third '/'-separated segment of the link's
 *   href (presumably the id in "/events/<id>/" - verify against the markup).
 */
const parse_event_link = (row) => {
  const [event_anchor] = row.children[1].firstChild.getElementsByTagName('a');
  return event_anchor.href.split('/')[2];
};

/**
 * Extracts the external ticket URL from the row's last cell, if present.
 *
 * @param {Element} row - Table row (<tr>) of one event.
 * @returns {string|null} Value of the 'u' query parameter of the ticket
 *   link, or null when the cell has no link or the parameter is missing.
 */
const parse_ticket_url = (row) => {
  const link_container = row.lastChild.firstChild.firstChild;
  if (link_container.children.length === 0) {
    return null;
  }
  const [ticket_anchor] = link_container.getElementsByTagName('a');
  return new URL(ticket_anchor.href).searchParams.get('u');
};

/**
 * Reads the event name from the first <span> in the row's second cell.
 *
 * @param {Element} row - Table row (<tr>) of one event.
 * @returns {string} The trimmed innerHTML of the name element.
 */
const parse_event_name = (row) =>
  row.children[1].firstChild
    .getElementsByTagName('span')[0]
    .innerHTML.trim();

/**
 * Collects the innerHTML of every child of the events card's first child.
 *
 * Runs inside the browser page via page.evaluate, so it must stay
 * self-contained: it may only use its argument and browser globals.
 *
 * @param {string} card_id - DOM id of the upcoming events card.
 * @returns {string[]} One HTML string per child element.
 * @throws {Error} When the card, its first child or that child's children
 *   are missing.
 */
const get_events = (card_id) => {
  const card = document.getElementById(card_id);
  if (card === null) {
    throw new Error(`Element ${card_id} was not found.`);
  }
  if (card.firstChild === null) {
    throw new Error(`Element ${card_id} firstChild was not found.`);
  }
  // `children` is undefined (not null) when firstChild is not an element.
  if (card.firstChild.children == null) {
    throw new Error(`Element ${card_id} children not found.`);
  }
  return Array.from(card.firstChild.children).map((item) => item.innerHTML);
};

/**
 * Parses an HTML fragment with JSDOM and returns its first table row.
 *
 * @param {string} html - HTML fragment taken from the events card.
 * @returns {Element|undefined} The first <tr>, or undefined if there is none.
 */
const html_to_table_row = (html) => {
  const { document } = new JSDOM(html).window;
  return document.body.getElementsByTagName('tr')[0];
};

/**
 * Converts one event table row into a plain event record.
 *
 * @param {Element} table_row - Table row (<tr>) of one event.
 * @returns {{date: Date, name: string, event_id: (string|undefined),
 *   ticket_url: (string|null), location: {host: string, location: string}}}
 */
const parse_row_to_event = (table_row) => ({
  date: parse_event_date(table_row),
  name: parse_event_name(table_row),
  event_id: parse_event_link(table_row),
  ticket_url: parse_ticket_url(table_row),
  location: parse_ticket_location(table_row),
});

/**
 * Opens the events page in headless Chrome, scrapes the upcoming events and
 * prints them to stdout as a JSON array.
 *
 * Errors are logged to stderr and reported through a non-zero exit code
 * instead of being thrown. The browser is always closed.
 *
 * @returns {Promise<void>}
 */
const load_page = async () => {
  let browser;
  try {
    browser = await puppeteer.launch({ headless: true });
    const page = await browser.newPage();
    await page.goto(event_page);
    await page.waitForSelector(upcoming_event_selector);
    await page.evaluate(() => {
      window.scrollBy(0, window.innerHeight);
    });
    // Give the page time to react to the scroll. A plain timer is used
    // because page.waitFor is deprecated in newer Puppeteer releases.
    await new Promise((resolve) => {
      setTimeout(resolve, SCROLL_SETTLE_MS);
    });

    const events = await page.evaluate(get_events, upcoming_event_id);
    const parsed_events = events
      .map(html_to_table_row)
      // Drop fragments that contained no table row.
      .filter((table_row) => table_row)
      .map(parse_row_to_event);
    console.log(JSON.stringify(parsed_events));
  } catch (e) {
    console.error(e);
    // Make the failure visible to callers of the script.
    process.exitCode = 1;
  } finally {
    if (browser !== undefined) {
      try {
        await browser.close();
      } catch (close_error) {
        console.error(close_error);
      }
    }
  }
};

(async () => {
  await load_page();
  // Without an argument this exits with process.exitCode when it is set.
  process.exit();
})();