initial release

This commit is contained in:
2024-02-29 10:23:41 -06:00
commit 5146c8e919
210 changed files with 11040 additions and 0 deletions

17
crawler/.editorconfig Normal file
View File

@@ -0,0 +1,17 @@
# Editor configuration, see https://editorconfig.org
root = true
[*]
charset = utf-8
indent_style = space
indent_size = 4
max_line_length = 180
insert_final_newline = true
trim_trailing_whitespace = true
[*.ts]
quote_type = single
[*.md]
max_line_length = off
trim_trailing_whitespace = false

57
crawler/.vscode/launch.json vendored Normal file
View File

@@ -0,0 +1,57 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"type": "node",
"request": "launch",
"name": "Crawler",
"skipFiles": [
"<node_internals>/**"
],
"program": "${workspaceFolder}/index.js",
"outFiles": [
"${workspaceFolder}/**/*.js"
]
},
{
"type": "node",
"request": "launch",
"name": "Import",
"skipFiles": [
"<node_internals>/**"
],
"program": "${workspaceFolder}/import.js",
"outFiles": [
"${workspaceFolder}/**/*.js"
]
},
{
"type": "node",
"request": "launch",
"name": "ChangeUserId",
"skipFiles": [
"<node_internals>/**"
],
"args": ["--userId","e0811669-c7eb-4e5e-a699-e8334d5c5b01"],
"program": "${workspaceFolder}/changeUserId.js",
"outFiles": [
"${workspaceFolder}/**/*.js"
]
},
{
"type": "node",
"request": "launch",
"name": "updateFields",
"skipFiles": [
"<node_internals>/**"
],
"program": "${workspaceFolder}/updateFields.js",
"outFiles": [
"${workspaceFolder}/**/*.js"
]
},
]
}

30
crawler/changeUserId.ts Normal file
View File

@@ -0,0 +1,30 @@
import yargs from 'yargs'
import fs from 'fs-extra';
import { hideBin } from 'yargs/helpers'
import { BusinessListing } from "../common-models/src/main.model"
// Parse the command line synchronously. `userId` is declared as a string option so that
// ids consisting only of digits are not coerced to numbers.
const argv = yargs(hideBin(process.argv))
    .option('userId', { type: 'string' })
    .parseSync();
if (!argv.userId) {
    console.log(' --userId [any valid userId]')
    process.exit(1)
}
// Captured after the guard above so the closure below sees a plain `string`.
const newUserId: string = argv.userId;

/**
 * Loads every listing from the local bizmatch API, assigns it to the user given via
 * `--userId`, resets its `created`/`updated` timestamps to the current time and writes
 * it back with a PUT request, one listing at a time.
 *
 * A failed GET aborts the run; a failed PUT is reported and the loop continues.
 */
(async () => {
    const listingsUrl = 'http://localhost:3000/bizmatch/listings';
    const jsonHeaders = { 'Content-Type': 'application/json' };

    const listResponse = await fetch(listingsUrl, {
        method: 'GET',
        headers: jsonHeaders,
    });
    if (!listResponse.ok) {
        throw new Error(`GET ${listingsUrl} failed: ${listResponse.status} ${listResponse.statusText}`);
    }
    const listings: Array<BusinessListing> = await listResponse.json();

    for (const listing of listings) {
        const now = new Date();
        listing.userId = newUserId;
        listing.created = now;
        listing.updated = now;
        const updateResponse = await fetch(`${listingsUrl}/${listing.id}`, {
            method: 'PUT',
            body: JSON.stringify(listing),
            headers: jsonHeaders,
        });
        if (!updateResponse.ok) {
            console.error(`PUT ${listingsUrl}/${listing.id} failed: ${updateResponse.status} ${updateResponse.statusText}`);
        }
    }
})().catch((error: unknown) => {
    // Surface failures instead of leaving an unhandled promise rejection.
    console.error(error);
    process.exitCode = 1;
});

13
crawler/import.ts Normal file
View File

@@ -0,0 +1,13 @@
import fs from 'fs-extra';
/**
 * Reads `./listings.json` and POSTs each entry to the local bizmatch API, one request
 * at a time.
 *
 * A file that does not contain a JSON array aborts the run; a failed POST is reported
 * and the import continues with the next entry.
 */
(async () => {
    const listingsUrl = 'http://localhost:3000/bizmatch/listings';
    const listings: unknown = await fs.readJson('./listings.json');
    if (!Array.isArray(listings)) {
        throw new Error('./listings.json does not contain a JSON array');
    }
    for (const [index, listing] of listings.entries()) {
        const response = await fetch(listingsUrl, {
            method: 'POST',
            body: JSON.stringify(listing),
            headers: { 'Content-Type': 'application/json' },
        });
        if (!response.ok) {
            console.error(`POST ${listingsUrl} failed for entry ${index}: ${response.status} ${response.statusText}`);
        }
    }
})().catch((error: unknown) => {
    // Surface failures instead of leaving an unhandled promise rejection.
    console.error(error);
    process.exitCode = 1;
});

172
crawler/index.ts Normal file
View File

@@ -0,0 +1,172 @@
// import puppeteer, { Browser, ElementHandle, Page } from 'puppeteer-core';
import puppeteer, { Browser, ElementHandle, Page } from 'puppeteer';
import { BusinessListing } from "../common-models/src/main.model"
import currency from 'currency.js';
import fs from 'fs-extra'
/** A display name paired with the string identifier that stands for it. */
export interface KeyValue {
    name: string;
    value: string;
}

/**
 * Known business categories. Each row is `[name, value]`; the crawler looks a category
 * up by its name (case-insensitively) and stores the matching `value` as the listing's `type`.
 */
const typesOfBusiness: Array<KeyValue> = [
    ['Automotive', '1'],
    ['Industrial Services', '2'],
    ['Real Estate', '3'],
    ['Uncategorized', '4'],
    ['Retail', '5'],
    ['Oilfield SVE and MFG.', '6'],
    ['Service', '7'],
    ['Advertising', '8'],
    ['Agriculture', '9'],
    ['Franchise', '10'],
    ['Professional', '11'],
    ['Manufacturing', '12'],
    ['Food and Restaurant', '13'],
].map(([name, value]) => ({ name, value }));
/**
 * Collects the text nodes that sit next to a label element.
 *
 * Inside the page, the label's parent element is inspected: if the parent has a `PRE`
 * child, the direct text nodes of the first such child are used, otherwise the parent's
 * own direct text nodes. Text nodes that consist of exactly one newline are skipped, and
 * the first newline of every remaining node is removed.
 *
 * @param elementHandle handle of the label element, or `null`
 * @returns `null` when no handle was given or the element has no parent; a single string
 *          when fewer than two text nodes were found (an empty string for none); otherwise
 *          the array of text fragments.
 */
async function getParentElementText(elementHandle: ElementHandle<Element> | null): Promise<string | string[] | null> {
    if (!elementHandle) {
        return null;
    }
    // The callback is executed in the browser, so it has to stay self-contained.
    const fragments: string[] | null = await elementHandle.evaluate((el) => {
        const getText = (nodes: Node[]): string[] => {
            const result: string[] = [];
            for (const node of nodes) {
                const value = node.nodeValue;
                if (node.nodeType === Node.TEXT_NODE && value !== null && value !== '\n') {
                    // String pattern: only the first newline of the node is dropped.
                    result.push(value.replace('\n', ''));
                }
            }
            return result;
        };
        const parent = el.parentElement;
        if (!parent) return null;
        const preResult = Array.from(parent.childNodes).find((e) => e.nodeName === 'PRE');
        return getText(Array.from(preResult ? preResult.childNodes : parent.childNodes));
    });
    if (!fragments) {
        return null;
    }
    // Zero or one fragment collapses to a plain string; several fragments stay an array.
    return fragments.length < 2 ? fragments.join() : fragments;
}
/**
 * Scrapes a listing detail page into a `BusinessListing`.
 *
 * The title is read from `div.title`. For every known label, the `div.sub-title` element
 * with exactly that text is looked up and the text next to it is collected; a label for
 * which no element is reported gets the value 'N/A', and a label whose text extraction
 * throws is logged and left undefined.
 *
 * @param page an already loaded listing detail page
 * @returns the listing, or `null` (after logging) when the page's category cannot be
 *          mapped to one of `typesOfBusiness`
 */
async function extractListingData(page: Page): Promise<BusinessListing | null> {
    const labels = {
        summaryLabel: 'Summary',
        descriptionLabel: 'Description',
        categoryLabel: 'Category:',
        locationLabel: 'Located in:',
        askingPriceLabel: 'Asking Price:',
        realEstateLabel: 'Real Estate Included:',
        salesRevenueLabel: 'Sales revenue:',
        cashflowLabel: 'Cash flow:',
        inventoryLabel: 'Inventory:',
        brokerLabel: 'Broker licensing:',
        reasonLabel: 'Reason for sale:',
        employeesLabel: 'Employees:',
    };
    type FieldText = string | string[] | null | undefined;

    const title = ((await page.$eval('div.title', (el) => el.textContent)) ?? '').trim();

    const content: Record<string, FieldText> = {};
    for (const key of Object.values(labels)) {
        const element = await findElementWithText(page, 'div.sub-title', key);
        try {
            content[key] = element ? await getParentElementText(element) : 'N/A';
        } catch (error) {
            console.log(`Fehler bei : ${key}`);
        }
    }

    // The collected text may be a list of fragments; flatten it before matching so that
    // a multi-fragment category cannot blow up on `.toLowerCase()`.
    const rawCategory = content[labels.categoryLabel];
    const categoryName = (Array.isArray(rawCategory) ? rawCategory.join(' ') : rawCategory ?? '').trim();
    const categoryType = categoryName
        ? typesOfBusiness.find((t) => t.name.toLowerCase() === categoryName.toLowerCase())
        : undefined;
    if (!categoryName) {
        console.log(`---> No Category ...`);
    }
    if (!categoryType) {
        console.log(`---> ${rawCategory}`);
        console.log(`Fehler bei ${title}`);
        return null;
    }

    // Summary and description are always stored as arrays of fragments.
    const asList = (value: FieldText) => (Array.isArray(value) ? value : [value]);
    // Anything that is not a string (missing value, fragment list) counts as 0.
    const toAmount = (value: FieldText): number => currency(typeof value === 'string' ? value : 0).value;

    const listing = {
        id: 'NA',
        userId: '1',
        listingsCategory: 'business',
        title: title,
        summary: asList(content[labels.summaryLabel]),
        description: asList(content[labels.descriptionLabel]),
        type: categoryType.value,
        location: content[labels.locationLabel],
        price: toAmount(content[labels.askingPriceLabel]),
        salesRevenue: toAmount(content[labels.salesRevenueLabel]),
        cashFlow: toAmount(content[labels.cashflowLabel]),
        brokerLicencing: content[labels.brokerLabel],
        established: null,
        realEstateIncluded: content[labels.realEstateLabel] === 'Yes',
        inventory: content[labels.inventoryLabel],
        employees: content[labels.employeesLabel],
        reasonForSale: content[labels.reasonLabel],
        internals: '',
    } as BusinessListing;
    return listing;
}
/**
 * Finds the first element matching `selector` whose trimmed text content equals `text`.
 *
 * `page.evaluateHandle` always resolves to a handle object, even when the in-page
 * callback returns `undefined`, so the raw handle cannot be used for a found/not-found
 * check. The handle is therefore narrowed with `asElement()`.
 *
 * @param page page to search in
 * @param selector CSS selector of the candidate elements
 * @param text exact (trimmed) text the element must contain
 * @returns a handle to the matching element, or `null` when there is none
 */
async function findElementWithText(page: Page, selector: string, text: string): Promise<ElementHandle<Element> | null> {
    const handle = await page.evaluateHandle(
        (selector, text) => {
            const elements = Array.from(document.querySelectorAll(selector));
            return elements.find((element) => element.textContent?.trim() === text);
        },
        selector,
        text
    );
    const element = handle.asElement();
    if (!element) {
        // Nothing matched: release the handle that wraps `undefined`.
        await handle.dispose();
        return null;
    }
    // The candidates come from querySelectorAll, so the node is an Element.
    return element as ElementHandle<Element>;
}
/**
 * Crawls a results page and every page reachable through its `a.next` link.
 *
 * For each `div.ResultsGridItem` that has an `a.viewListing` link, the detail page is
 * opened, scraped with `extractListingData`, and a non-null result is logged as JSON and
 * appended to `out`. Pages are walked in a loop and each one is closed before the next is
 * opened, so only one results page and one detail page are open at a time, and pages are
 * closed even when navigation or extraction throws.
 *
 * @param browser browser used to open pages
 * @param url address of the first results page
 * @param out array that receives the extracted listings
 */
async function processPage(browser: Browser, url: string, out: Array<any>) {
    // Guards against a "next" link that leads back to a page that was already crawled.
    const visited = new Set<string>();
    let currentUrl: string | null = url;

    while (currentUrl && !visited.has(currentUrl)) {
        visited.add(currentUrl);
        const page = await browser.newPage();
        try {
            await page.goto(currentUrl, { waitUntil: 'domcontentloaded' });
            // Relative hrefs are resolved against the page's address after navigation.
            const baseUrl = page.url();

            const listings = await page.$$('div.ResultsGridItem');
            for (const listing of listings) {
                const detailLinkElement = await listing.$('a.viewListing');
                if (!detailLinkElement) continue;
                const detailLink = await detailLinkElement.evaluate((el) => el.getAttribute('href'));
                if (!detailLink) continue;

                const detailPage = await browser.newPage();
                try {
                    await detailPage.goto(new URL(detailLink, baseUrl).href, { waitUntil: 'domcontentloaded' });
                    const listingData = await extractListingData(detailPage);
                    if (listingData) {
                        console.log(JSON.stringify(listingData));
                        out.push(listingData);
                    }
                } finally {
                    await detailPage.close();
                }
            }

            const nextPageElement = await page.$('a.next');
            const nextPageLink = nextPageElement
                ? await nextPageElement.evaluate((el) => el.getAttribute('href'))
                : null;
            currentUrl = nextPageLink ? new URL(nextPageLink, baseUrl).href : null;
        } finally {
            await page.close();
        }
    }
}
/**
 * Crawler entry point: launches the browser, crawls all result pages starting at
 * https://www.bizmatch.net/results and writes the collected listings to `./listings.json`.
 * The browser is closed even when crawling or writing fails.
 */
(async () => {
    const browser = await puppeteer.launch({ headless: true, executablePath: '/snap/bin/chromium', devtools: true, slowMo: 50 });
    //const browser = await puppeteer.launch({devtools: true});
    try {
        const out: BusinessListing[] = [];
        await processPage(browser, 'https://www.bizmatch.net/results', out);
        await fs.writeJson('./listings.json', out);
    } finally {
        await browser.close();
    }
})().catch((error: unknown) => {
    // Surface failures instead of leaving an unhandled promise rejection.
    console.error(error);
    process.exitCode = 1;
});

3035
crawler/listings.json Normal file

File diff suppressed because it is too large Load Diff

23
crawler/package.json Normal file
View File

@@ -0,0 +1,23 @@
{
"name": "git",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"keywords": [],
"author": "",
"license": "ISC",
"devDependencies": {
"typescript": "^5.2.2"
},
"dependencies": {
"currency.js": "^2.0.4",
"fs-extra": "^11.2.0",
"ioredis": "^5.3.2",
"node-fetch": "^3.3.2",
"puppeteer": "^22.1.0",
"yargs": "^17.7.2"
}
}

109
crawler/tsconfig.json Normal file
View File

@@ -0,0 +1,109 @@
{
"compilerOptions": {
/* Visit https://aka.ms/tsconfig to read more about this file */
/* Projects */
// "incremental": true, /* Save .tsbuildinfo files to allow for incremental compilation of projects. */
// "composite": true, /* Enable constraints that allow a TypeScript project to be used with project references. */
// "tsBuildInfoFile": "./.tsbuildinfo", /* Specify the path to .tsbuildinfo incremental compilation file. */
// "disableSourceOfProjectReferenceRedirect": true, /* Disable preferring source files instead of declaration files when referencing composite projects. */
// "disableSolutionSearching": true, /* Opt a project out of multi-project reference checking when editing. */
// "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. */
/* Language and Environment */
"target": "es2016", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */
// "lib": [], /* Specify a set of bundled library declaration files that describe the target runtime environment. */
// "jsx": "preserve", /* Specify what JSX code is generated. */
// "experimentalDecorators": true, /* Enable experimental support for legacy experimental decorators. */
// "emitDecoratorMetadata": true, /* Emit design-type metadata for decorated declarations in source files. */
// "jsxFactory": "", /* Specify the JSX factory function used when targeting React JSX emit, e.g. 'React.createElement' or 'h'. */
// "jsxFragmentFactory": "", /* Specify the JSX Fragment reference used for fragments when targeting React JSX emit e.g. 'React.Fragment' or 'Fragment'. */
// "jsxImportSource": "", /* Specify module specifier used to import the JSX factory functions when using 'jsx: react-jsx*'. */
// "reactNamespace": "", /* Specify the object invoked for 'createElement'. This only applies when targeting 'react' JSX emit. */
// "noLib": true, /* Disable including any library files, including the default lib.d.ts. */
// "useDefineForClassFields": true, /* Emit ECMAScript-standard-compliant class fields. */
// "moduleDetection": "auto", /* Control what method is used to detect module-format JS files. */
/* Modules */
"module": "commonjs", /* Specify what module code is generated. */
// "rootDir": "./", /* Specify the root folder within your source files. */
// "moduleResolution": "node10", /* Specify how TypeScript looks up a file from a given module specifier. */
// "baseUrl": "./", /* Specify the base directory to resolve non-relative module names. */
// "paths": {}, /* Specify a set of entries that re-map imports to additional lookup locations. */
// "rootDirs": [], /* Allow multiple folders to be treated as one when resolving modules. */
// "typeRoots": [], /* Specify multiple folders that act like './node_modules/@types'. */
// "types": [], /* Specify type package names to be included without being referenced in a source file. */
// "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */
// "moduleSuffixes": [], /* List of file name suffixes to search when resolving a module. */
// "allowImportingTsExtensions": true, /* Allow imports to include TypeScript file extensions. Requires '--moduleResolution bundler' and either '--noEmit' or '--emitDeclarationOnly' to be set. */
// "resolvePackageJsonExports": true, /* Use the package.json 'exports' field when resolving package imports. */
// "resolvePackageJsonImports": true, /* Use the package.json 'imports' field when resolving imports. */
// "customConditions": [], /* Conditions to set in addition to the resolver-specific defaults when resolving imports. */
// "resolveJsonModule": true, /* Enable importing .json files. */
// "allowArbitraryExtensions": true, /* Enable importing files with any extension, provided a declaration file is present. */
// "noResolve": true, /* Disallow 'import's, 'require's or '<reference>'s from expanding the number of files TypeScript should add to a project. */
/* JavaScript Support */
// "allowJs": true, /* Allow JavaScript files to be a part of your program. Use the 'checkJS' option to get errors from these files. */
// "checkJs": true, /* Enable error reporting in type-checked JavaScript files. */
// "maxNodeModuleJsDepth": 1, /* Specify the maximum folder depth used for checking JavaScript files from 'node_modules'. Only applicable with 'allowJs'. */
/* Emit */
// "declaration": true, /* Generate .d.ts files from TypeScript and JavaScript files in your project. */
// "declarationMap": true, /* Create sourcemaps for d.ts files. */
// "emitDeclarationOnly": true, /* Only output d.ts files and not JavaScript files. */
"sourceMap": true, /* Create source map files for emitted JavaScript files. */
// "inlineSourceMap": true, /* Include sourcemap files inside the emitted JavaScript. */
// "outFile": "./", /* Specify a file that bundles all outputs into one JavaScript file. If 'declaration' is true, also designates a file that bundles all .d.ts output. */
// "outDir": "./", /* Specify an output folder for all emitted files. */
// "removeComments": true, /* Disable emitting comments. */
// "noEmit": true, /* Disable emitting files from a compilation. */
// "importHelpers": true, /* Allow importing helper functions from tslib once per project, instead of including them per-file. */
// "importsNotUsedAsValues": "remove", /* Specify emit/checking behavior for imports that are only used for types. */
// "downlevelIteration": true, /* Emit more compliant, but verbose and less performant JavaScript for iteration. */
// "sourceRoot": "", /* Specify the root path for debuggers to find the reference source code. */
// "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */
// "inlineSources": true, /* Include source code in the sourcemaps inside the emitted JavaScript. */
// "emitBOM": true, /* Emit a UTF-8 Byte Order Mark (BOM) in the beginning of output files. */
// "newLine": "crlf", /* Set the newline character for emitting files. */
// "stripInternal": true, /* Disable emitting declarations that have '@internal' in their JSDoc comments. */
// "noEmitHelpers": true, /* Disable generating custom helper functions like '__extends' in compiled output. */
// "noEmitOnError": true, /* Disable emitting files if any type checking errors are reported. */
// "preserveConstEnums": true, /* Disable erasing 'const enum' declarations in generated code. */
// "declarationDir": "./", /* Specify the output directory for generated declaration files. */
// "preserveValueImports": true, /* Preserve unused imported values in the JavaScript output that would otherwise be removed. */
/* Interop Constraints */
// "isolatedModules": true, /* Ensure that each file can be safely transpiled without relying on other imports. */
// "verbatimModuleSyntax": true, /* Do not transform or elide any imports or exports not marked as type-only, ensuring they are written in the output file's format based on the 'module' setting. */
"allowSyntheticDefaultImports": true, /* Allow 'import x from y' when a module doesn't have a default export. */
"esModuleInterop": true, /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables 'allowSyntheticDefaultImports' for type compatibility. */
// "preserveSymlinks": true, /* Disable resolving symlinks to their realpath. This correlates to the same flag in node. */
"forceConsistentCasingInFileNames": true, /* Ensure that casing is correct in imports. */
/* Type Checking */
"strict": false, /* Enable all strict type-checking options. */
// "noImplicitAny": true, /* Enable error reporting for expressions and declarations with an implied 'any' type. */
// "strictNullChecks": true, /* When type checking, take into account 'null' and 'undefined'. */
// "strictFunctionTypes": true, /* When assigning functions, check to ensure parameters and the return values are subtype-compatible. */
// "strictBindCallApply": true, /* Check that the arguments for 'bind', 'call', and 'apply' methods match the original function. */
// "strictPropertyInitialization": true, /* Check for class properties that are declared but not set in the constructor. */
// "noImplicitThis": true, /* Enable error reporting when 'this' is given the type 'any'. */
// "useUnknownInCatchVariables": true, /* Default catch clause variables as 'unknown' instead of 'any'. */
// "alwaysStrict": true, /* Ensure 'use strict' is always emitted. */
// "noUnusedLocals": true, /* Enable error reporting when local variables aren't read. */
// "noUnusedParameters": true, /* Raise an error when a function parameter isn't read. */
// "exactOptionalPropertyTypes": true, /* Interpret optional property types as written, rather than adding 'undefined'. */
// "noImplicitReturns": true, /* Enable error reporting for codepaths that do not explicitly return in a function. */
// "noFallthroughCasesInSwitch": true, /* Enable error reporting for fallthrough cases in switch statements. */
// "noUncheckedIndexedAccess": true, /* Add 'undefined' to a type when accessed using an index. */
// "noImplicitOverride": true, /* Ensure overriding members in derived classes are marked with an override modifier. */
// "noPropertyAccessFromIndexSignature": true, /* Enforces using indexed accessors for keys declared using an indexed type. */
// "allowUnusedLabels": true, /* Disable error reporting for unused labels. */
// "allowUnreachableCode": true, /* Disable error reporting for unreachable code. */
/* Completeness */
// "skipDefaultLibCheck": true, /* Skip type checking .d.ts files that are included with TypeScript. */
"skipLibCheck": true /* Skip type checking all .d.ts files. */
}
}

34
crawler/updateFields.ts Normal file
View File

@@ -0,0 +1,34 @@
import yargs from 'yargs'
import fs from 'fs-extra';
import { hideBin } from 'yargs/helpers'
import { BusinessListing } from "../common-models/src/main.model"
//const argv = yargs(hideBin(process.argv)).argv
// if (!argv.userId){
// console.log(' --userId [any valid userId]')
// process.exit(1)
// }
/**
 * Replaces the free-text `location` of every listing with the `value` of the select
 * option whose `name` matches it case-insensitively, then writes each listing back to
 * the local bizmatch API with a PUT request.
 *
 * Listings whose location is not a string, or matches no option, are written back
 * unchanged. A failed GET aborts the run; a failed PUT is reported and the loop continues.
 */
(async () => {
    const baseUrl = 'http://localhost:3000/bizmatch';
    const jsonHeaders = { 'Content-Type': 'application/json' };

    const selectOptionsResponse = await fetch(`${baseUrl}/select-options`, {
        method: 'GET',
        headers: jsonHeaders,
    });
    if (!selectOptionsResponse.ok) {
        throw new Error(`GET ${baseUrl}/select-options failed: ${selectOptionsResponse.status} ${selectOptionsResponse.statusText}`);
    }
    // NOTE(review): shape inferred from how the options are used below — confirm against the API.
    const selectOptions: { locations?: Array<{ name: string; value: string }> } = await selectOptionsResponse.json();
    const locations = selectOptions.locations ?? [];

    const listingsResponse = await fetch(`${baseUrl}/listings`, {
        method: 'GET',
        headers: jsonHeaders,
    });
    if (!listingsResponse.ok) {
        throw new Error(`GET ${baseUrl}/listings failed: ${listingsResponse.status} ${listingsResponse.statusText}`);
    }
    const listings: Array<BusinessListing> = await listingsResponse.json();

    for (const listing of listings) {
        // Crawled listings may carry a missing or non-string location; those cannot be matched.
        const wanted = typeof listing.location === 'string' ? listing.location.toLowerCase() : undefined;
        const option = wanted === undefined
            ? undefined
            : locations.find((l) => typeof l.name === 'string' && l.name.toLowerCase() === wanted);
        if (option) {
            listing.location = option.value;
        }
        const updateResponse = await fetch(`${baseUrl}/listings/${listing.id}`, {
            method: 'PUT',
            body: JSON.stringify(listing),
            headers: jsonHeaders,
        });
        if (!updateResponse.ok) {
            console.error(`PUT ${baseUrl}/listings/${listing.id} failed: ${updateResponse.status} ${updateResponse.statusText}`);
        }
    }
})().catch((error: unknown) => {
    // Surface failures instead of leaving an unhandled promise rejection.
    console.error(error);
    process.exitCode = 1;
});