initial release
This commit is contained in:
172
crawler/index.ts
Normal file
172
crawler/index.ts
Normal file
@@ -0,0 +1,172 @@
|
||||
// import puppeteer, { Browser, ElementHandle, Page } from 'puppeteer-core';
|
||||
import puppeteer, { Browser, ElementHandle, Page } from 'puppeteer';
|
||||
import { BusinessListing } from "../common-models/src/main.model"
|
||||
import currency from 'currency.js';
|
||||
import fs from 'fs-extra'
|
||||
|
||||
/**
 * A display name paired with the string identifier that stands for it.
 */
export interface KeyValue {
  /** Human-readable label. */
  name: string;
  /** Identifier associated with the label, kept as a string. */
  value: string;
}
|
||||
/**
 * Business categories the crawler recognises. `name` is the category text
 * matched against a scraped listing; `value` is the id stored as the
 * listing's `type`.
 */
const typesOfBusiness: Array<KeyValue> = [
  { name: 'Automotive', value: '1' },
  { name: 'Industrial Services', value: '2' },
  { name: 'Real Estate', value: '3' },
  { name: 'Uncategorized', value: '4' },
  { name: 'Retail', value: '5' },
  { name: 'Oilfield SVE and MFG.', value: '6' },
  { name: 'Service', value: '7' },
  { name: 'Advertising', value: '8' },
  { name: 'Agriculture', value: '9' },
  { name: 'Franchise', value: '10' },
  { name: 'Professional', value: '11' },
  { name: 'Manufacturing', value: '12' },
  { name: 'Food and Restaurant', value: '13' },
];
|
||||
|
||||
/**
 * Reads the text nodes that are siblings of the given element.
 *
 * The lookup runs inside the page: it takes the element's parent and, if the
 * parent has a direct `PRE` child, collects the text nodes of that `PRE`;
 * otherwise it collects the parent's own direct text nodes. Text nodes whose
 * value is exactly `"\n"` are skipped and the first `"\n"` of every other
 * text node is removed.
 *
 * @param elementHandle Handle of the label element, or `null`.
 * @returns `null` when no handle is given or the element has no parent;
 *          a plain string when zero or one text node was collected
 *          (an empty string for zero); otherwise the array of collected texts.
 */
async function getParentElementText(
  elementHandle: ElementHandle<Element> | null,
): Promise<string | string[] | null> {
  if (!elementHandle) {
    return null;
  }

  const textContent: string[] | null = await elementHandle.evaluate((el: Element) => {
    // Defined inside the callback because the callback is executed in the
    // browser context and cannot close over Node-side helpers.
    const getText = (nodes: Node[]): string[] => {
      const result: string[] = [];
      for (const node of nodes) {
        if (node.nodeType === Node.TEXT_NODE && node.nodeValue !== '\n') {
          // `nodeValue` is typed nullable; text nodes always carry a string.
          result.push((node.nodeValue ?? '').replace('\n', ''));
        }
      }
      return result;
    };

    const parent = el.parentElement;
    if (!parent) return null;

    // Prefer the contents of a <pre> child when the parent has one.
    const preResult = Array.from(parent.childNodes).find((e) => e.nodeName === 'PRE');
    return getText(Array.from((preResult ?? parent).childNodes));
  });

  if (!textContent) {
    return null;
  }
  // Collapse the 0/1-element case to a plain string; keep longer results as an array.
  return textContent.length < 2 ? textContent.join() : textContent;
}
|
||||
|
||||
/**
 * Scrapes a listing detail page into a `BusinessListing`.
 *
 * The title is read from `div.title`. For every caption in `labels` the
 * matching `div.sub-title` element is located and the text next to it is
 * collected; captions that are not present on the page yield `'N/A'`.
 * The category text is matched case-insensitively against `typesOfBusiness`.
 *
 * @param page An already-navigated detail page.
 * @returns The populated listing, or `null` when the page's category is
 *          missing or not one of the known business types.
 */
async function extractListingData(page: Page): Promise<BusinessListing | null> {
  // Captions of the `div.sub-title` elements whose neighbouring text is collected.
  const labels = {
    summaryLabel: 'Summary',
    descriptionLabel: 'Description',
    categoryLabel: 'Category:',
    locationLabel: 'Located in:',
    askingPriceLabel: 'Asking Price:',
    realEstateLabel: 'Real Estate Included:',
    salesRevenueLabel: 'Sales revenue:',
    cashflowLabel: 'Cash flow:',
    inventoryLabel: 'Inventory:',
    brokerLabel: 'Broker licensing:',
    reasonLabel: 'Reason for sale:',
    employeesLabel: 'Employees:',
  };

  // A scraped field is a single string, several text fragments, or absent.
  type FieldText = string | string[] | null | undefined;

  const title = ((await page.$eval('div.title', (el: Element) => el.textContent)) ?? '').trim();

  const content: Record<string, FieldText> = {};
  for (const label of Object.values(labels)) {
    const handle = await findElementWithText(page, 'div.sub-title', label);
    // A handle can exist without pointing at an element (nothing matched);
    // `asElement()` tells the two cases apart so the 'N/A' fallback is reachable.
    const element = handle ? (handle.asElement() as ElementHandle<Element> | null) : null;
    try {
      content[label] = element ? await getParentElementText(element) : 'N/A';
    } catch (error) {
      console.log(`Fehler bei : ${label}`, error);
    } finally {
      // Release the remote object so handles do not pile up over a long crawl.
      await handle?.dispose();
    }
  }

  // The category may arrive as several text fragments; flatten before comparing.
  const rawCategory = content[labels.categoryLabel];
  const categoryName = (Array.isArray(rawCategory) ? rawCategory.join(' ') : rawCategory ?? '').trim();

  let categoryType: KeyValue | undefined;
  if (categoryName) {
    const wanted = categoryName.toLowerCase();
    categoryType = typesOfBusiness.find((t) => t.name.toLowerCase() === wanted);
  } else {
    console.log(`---> No Category ...`);
  }

  if (!categoryType) {
    // Without a known category the listing cannot be typed; skip it explicitly
    // instead of relying on a TypeError from dereferencing `undefined`.
    console.log(`---> ${rawCategory}`);
    console.log(`Fehler bei ${title}`);
    return null;
  }

  const asList = (value: FieldText) => (Array.isArray(value) ? value : [value]);
  // Anything that is not a plain string is treated as "no amount" (0).
  const amount = (value: FieldText): number => currency(typeof value === 'string' ? value : 0).value;

  const listing = {
    id: 'NA',
    userId: '1',
    listingsCategory: 'business',
    title: title,
    summary: asList(content[labels.summaryLabel]),
    description: asList(content[labels.descriptionLabel]),
    type: categoryType.value,
    location: content[labels.locationLabel],
    price: amount(content[labels.askingPriceLabel]),
    salesRevenue: amount(content[labels.salesRevenueLabel]),
    cashFlow: amount(content[labels.cashflowLabel]),
    brokerLicencing: content[labels.brokerLabel],
    established: null,
    realEstateIncluded: content[labels.realEstateLabel] === 'Yes',
    inventory: content[labels.inventoryLabel],
    employees: content[labels.employeesLabel],
    reasonForSale: content[labels.reasonLabel],
    internals: '',
  } as BusinessListing;
  return listing;
}
|
||||
|
||||
/**
 * Finds the first element matching `selector` whose trimmed text content
 * equals `text` exactly.
 *
 * @param page     Page to search.
 * @param selector CSS selector of the candidate elements.
 * @param text     Text the element's trimmed `textContent` must equal.
 * @returns A handle to the matching element, or `null` when none matches.
 */
async function findElementWithText(
  page: Page,
  selector: string,
  text: string,
): Promise<ElementHandle<Element> | null> {
  const handle = await page.evaluateHandle(
    (sel: string, wanted: string) => {
      const candidates = Array.from(document.querySelectorAll(sel));
      return candidates.find((candidate) => candidate.textContent?.trim() === wanted);
    },
    selector,
    text,
  );

  // `evaluateHandle` always resolves to a handle object, even when the page
  // function returned `undefined`, so truthiness of the handle says nothing.
  // `asElement()` is null exactly when no element was found.
  const element = handle.asElement();
  if (!element) {
    await handle.dispose();
    return null;
  }
  return element as ElementHandle<Element>;
}
|
||||
/**
 * Crawls a results page and every page reachable through its `a.next` link.
 *
 * For each `div.ResultsGridItem` that has an `a.viewListing` link, the linked
 * detail page is opened, scraped with `extractListingData`, logged as JSON and
 * appended to `out`. Result pages are processed in order, one at a time.
 *
 * @param browser Browser used to open result and detail pages.
 * @param url     URL of the first results page.
 * @param out     Array that receives every successfully extracted listing.
 */
async function processPage(browser: Browser, url: string, out: unknown[]): Promise<void> {
  // Guards against a "next" link that points back to an already crawled page.
  const visited = new Set<string>();
  let currentUrl: string | null = url;

  // Iterate instead of recursing so each results page is closed before the
  // next one is opened.
  while (currentUrl && !visited.has(currentUrl)) {
    visited.add(currentUrl);
    let nextUrl: string | null = null;

    const page = await browser.newPage();
    try {
      await page.goto(currentUrl, { waitUntil: 'domcontentloaded' });

      const listings = await page.$$('div.ResultsGridItem');
      for (const listing of listings) {
        const detailLinkElement = await listing.$('a.viewListing');
        if (!detailLinkElement) continue;

        const detailHref = await detailLinkElement.evaluate((el: Element) => el.getAttribute('href'));
        if (!detailHref) continue;

        const detailPage = await browser.newPage();
        try {
          // Resolve against the current page so relative links work as well.
          await detailPage.goto(new URL(detailHref, page.url()).href, { waitUntil: 'domcontentloaded' });

          const listingData = await extractListingData(detailPage);
          if (listingData) {
            console.log(JSON.stringify(listingData));
            out.push(listingData);
          }
        } finally {
          // Close the tab even when navigation or extraction throws.
          await detailPage.close();
        }
      }

      const nextPageElement = await page.$('a.next');
      const nextHref = nextPageElement
        ? await nextPageElement.evaluate((el: Element) => el.getAttribute('href'))
        : null;
      nextUrl = nextHref ? new URL(nextHref, page.url()).href : null;
    } finally {
      await page.close();
    }

    currentUrl = nextUrl;
  }
}
|
||||
|
||||
// Entry point: launch the browser, crawl all result pages starting at the
// bizmatch results URL and write the collected listings to ./listings.json.
(async () => {
  // NOTE(review): per the Puppeteer launch options documentation, `devtools: true`
  // forces a non-headless browser, which conflicts with `headless: true` — confirm
  // which mode is intended.
  const browser = await puppeteer.launch({ headless: true, executablePath: '/snap/bin/chromium', devtools: true, slowMo: 50 });
  //const browser = await puppeteer.launch({devtools: true});
  try {
    const out: BusinessListing[] = [];
    await processPage(browser, 'https://www.bizmatch.net/results', out);
    await fs.writeJson('./listings.json', out);
  } finally {
    // Always shut the browser down, even when the crawl fails midway.
    await browser.close();
  }
})().catch((error: unknown) => {
  // Surface the failure instead of leaving an unhandled rejection.
  console.error(error);
  process.exitCode = 1;
});
|
||||
Reference in New Issue
Block a user