Dive into secure and efficient coding practices with our curated list of the top 10 examples showcasing 'htmlparser2' in functional components in JavaScript. Our advanced machine learning engine meticulously scans each line of code, cross-referencing millions of open source libraries to ensure your implementation is not just functional, but also robust and secure. Elevate your React applications to new heights by mastering the art of handling side effects, API calls, and asynchronous operations with confidence and precision.
/**
 * Requests a page and extracts search items from the response.
 *
 * The response is parsed with htmlparser2 and serialized back to markup
 * first; that re-serialized markup is what `domParser` receives before
 * `parseDocument` is applied with the supplied XPath.
 *
 * @param options.requestOptions - Forwarded unchanged to `request`.
 * @param options.xpath - Forwarded unchanged to `parseDocument`.
 * @returns `{items}` holding the `parseDocument` result on success, or
 *   `{err}` holding the caught error when any step inside the try throws.
 */
async function requestParseSearchItems ({requestOptions, xpath}) {
  try {
    const body = await request(requestOptions)
    // Round-trip the response through htmlparser2 once before the DOM parse
    const nodes = htmlparser2.parseDOM(body)
    const normalizedHtml = htmlparser2.DomUtils.getOuterHTML(nodes)
    const parsedDoc = domParser.parseFromString(normalizedHtml)
    const items = parseDocument(parsedDoc, xpath)
    return {items}
  } catch (failure) {
    // Failures are logged and reported through the result object
    console.error('解析失败', failure)
    return {err: failure}
  }
}
if (!contentDiv) {
throw new Error("Article content not found (no 'region-content' class)");
}
// remove article body-enclosing div (class "threed-sidebar-article-body"), then re-parent children
const bodyDiv = DomUtils.findOne(elem =>
elem.attribs && elem.attribs.class && elem.attribs.class.indexOf("threed-sidebar-article-body") >= 0,
contentDiv.children, true);
if (bodyDiv) {
const parent: any = bodyDiv.parent;
bodyDiv.children.forEach(child => DomUtils.appendChild(parent, child));
DomUtils.removeElement(bodyDiv);
}
const title = DomUtils.findOne(elem => elem.name === "h1",
contentDiv.children, true);
const titleText = title && DomUtils.getText(title);
article.title = titleText || `Article No. ${index + 1}`;
let imageIndex = 0;
const imageUrls: Dictionary = {};
DomUtils.findOne(elem => {
// download images
if (elem.name === "img" && elem.attribs && elem.attribs.src) {
const src = elem.attribs.src;
const imageUrl = src.startsWith("http") ? src : this.parameters.drupalBaseUrl + src;
const imageName = filenamify(decodeURIComponent(src.split("/").pop()));
const imageFileName = `article-${articleIndex}-${imageName}`;
const imageAssetPath = `${this.articlesDir}/${imageFileName}`;
export async function fetchArticle(context: IPlayContext, article: IArticle, url: string, index: number): Promise
{
const articleIndex = index.toString().padStart(2, "0");
console.log(`fetchArticle - fetching HTML from ${url}`);
const pageHtml = await fetch.text(url, "GET");
// parse the article's HTML content
const handler = new DomHandler();
const parser = new Parser(handler);
parser.write(pageHtml);
parser.done();
const dom = handler.dom;
// find parent of article content
const contentDiv = DomUtils.findOne(elem =>
elem.attribs && elem.attribs.class && elem.attribs.class.indexOf("region-content") >=0,
dom, true);
if (!contentDiv) {
throw new Error("Article content not found (no 'region-content' class)");
}
// remove article body-enclosing div (class "threed-sidebar-article-body"), then re-parent children
const bodyDiv = DomUtils.findOne(elem =>
elem.attribs && elem.attribs.class && elem.attribs.class.indexOf("threed-sidebar-article-body") >= 0,
contentDiv.children, true);
if (bodyDiv) {
const parent: any = bodyDiv.parent;
bodyDiv.children.forEach(child => DomUtils.appendChild(parent, child));
DomUtils.removeElement(bodyDiv);
async fetchArticle(article: IArticle, url: string, index: number): Promise
{
const articleIndex = index.toString().padStart(2, "0");
console.log(`fetchArticle - fetching HTML from ${url}`);
const pageHtml = await fetch.text(url, "GET");
// parse the article's HTML content
const handler = new DomHandler();
const parser = new Parser(handler);
parser.write(pageHtml);
parser.done();
const dom = handler.dom;
// find parent of article content
const contentDiv = DomUtils.findOne(elem =>
elem.attribs && elem.attribs.class && elem.attribs.class.indexOf("region-content") >=0,
dom, true);
if (!contentDiv) {
throw new Error("Article content not found (no 'region-content' class)");
}
// remove article body-enclosing div (class "threed-sidebar-article-body"), then re-parent children
const bodyDiv = DomUtils.findOne(elem =>
elem.attribs && elem.attribs.class && elem.attribs.class.indexOf("threed-sidebar-article-body") >= 0,
contentDiv.children, true);
if (bodyDiv) {
const parent: any = bodyDiv.parent;
bodyDiv.children.forEach(child => DomUtils.appendChild(parent, child));
DomUtils.removeElement(bodyDiv);
if (bodyDiv) {
const parent: any = bodyDiv.parent;
bodyDiv.children.forEach(child => DomUtils.appendChild(parent, child));
DomUtils.removeElement(bodyDiv);
}
const title = DomUtils.findOne(elem => elem.name === "h1",
contentDiv.children, true);
const titleText = title && DomUtils.getText(title);
article.title = titleText || `Article No. ${index + 1}`;
let imageIndex = 0;
const imageUrls: Dictionary = {};
DomUtils.findOne(elem => {
// download images
if (elem.name === "img" && elem.attribs && elem.attribs.src) {
const src = elem.attribs.src;
const imageUrl = src.startsWith("http") ? src : context.drupalBaseUrl + src;
const imageName = filenamify(decodeURIComponent(src.split("/").pop()));
const imageFileName = `article-${articleIndex}-${imageName}`;
const imageAssetPath = `${context.articleDir}/${imageFileName}`;
context.files[imageAssetPath] = imageAssetPath;
elem.attribs.src = imageFileName; // relative to location of html file
imageUrls[imageUrl] = imageAssetPath;
imageIndex++;
}
// remove additional classes from all nodes
if (elem.attribs && elem.attribs.class) {
if (!contentDiv) {
throw new Error("Article content not found (no 'region-content' class)");
}
// remove article body-enclosing div (class "threed-sidebar-article-body"), then re-parent children
const bodyDiv = DomUtils.findOne(elem =>
elem.attribs && elem.attribs.class && elem.attribs.class.indexOf("threed-sidebar-article-body") >= 0,
contentDiv.children, true);
if (bodyDiv) {
const parent: any = bodyDiv.parent;
bodyDiv.children.forEach(child => DomUtils.appendChild(parent, child));
DomUtils.removeElement(bodyDiv);
}
const title = DomUtils.findOne(elem => elem.name === "h1",
contentDiv.children, true);
const titleText = title && DomUtils.getText(title);
article.title = titleText || `Article No. ${index + 1}`;
let imageIndex = 0;
const imageUrls: Dictionary = {};
DomUtils.findOne(elem => {
// download images
if (elem.name === "img" && elem.attribs && elem.attribs.src) {
const src = elem.attribs.src;
const imageUrl = src.startsWith("http") ? src : context.drupalBaseUrl + src;
const imageName = filenamify(decodeURIComponent(src.split("/").pop()));
const imageFileName = `article-${articleIndex}-${imageName}`;
const imageAssetPath = `${context.articleDir}/${imageFileName}`;
}, function (body) {
var handler = new htmlparser.DefaultHandler()
var tokenizer = new (require('./parser'))
var parser = new htmlparser.Parser(handler);
tokenizer._cbs = new TokenizerProxy(parser._tokenizer._cbs)
parser._tokenizer = tokenizer
parser.parseComplete(body)
//console.log('=======')
// great. now it's time for a serializer.
//console.log( domutils.getOuterHTML(handler.dom[0]))
//console.log('=======')
//console.log(require('util').inspect(handler.dom[0], false, null))
var actual = new (xmldom.DOMParser)().parseFromString('')
actual.documentElement.parentNode.removeChild(actual.documentElement)
createXMLTemplate(actual, handler.dom[0])
// Why? Because. Because namespaces. Hateful namespaces.
var actual = new (xmldom.DOMParser)().parseFromString(actual.toString())
}, contentDiv.children, true);
// fetch all images
const urls = Object.keys(imageUrls);
const promises: Promise[] = urls.map(url => {
console.log(`fetchArticle - fetching image from ${url}`);
return fetch.buffer(url, "GET").then(image => {
const imageFileName = imageUrls[url];
const imageFilePath = this.getFilePath(imageFileName);
console.log(`fetchArticle - writing image to ${imageFilePath}`);
return fs.writeFile(imageFilePath, Buffer.from(image))
});
});
// write article HTML content
const contentHtml = DomUtils.getInnerHTML(contentDiv);
const articleFileName = `${this.articlesDir}/article-${articleIndex}.html`;
this.result.files[`scene_${articleFileName}`] = articleFileName;
const articleFilePath = this.getFilePath(articleFileName);
promises.push(fs.writeFile(articleFilePath, contentHtml));
return Promise.all(promises);
}
}, contentDiv.children, true);
// fetch all images
const urls = Object.keys(imageUrls);
const promises: Promise[] = urls.map(url => {
console.log(`fetchArticle - fetching image from ${url}`);
return fetch.buffer(url, "GET").then(image => {
const imageFileName = imageUrls[url];
const imageFilePath = path.resolve(context.job.jobDir, imageFileName);
console.log(`fetchArticle - writing image to ${imageFilePath}`);
return fs.writeFile(imageFilePath, Buffer.from(image))
});
});
// write article HTML content
const contentHtml = DomUtils.getInnerHTML(contentDiv);
const articleFileName = `${context.articleDir}/article-${articleIndex}.html`;
context.files[articleFileName] = articleFileName;
const articleFilePath = path.resolve(context.job.jobDir, articleFileName);
promises.push(fs.writeFile(articleFilePath, contentHtml));
return Promise.all(promises);
}
usedComponents () {
let tags = []
DomUtils.find((el) => {
let { name, attribs = {} } = el
// 记录所有非原生组件名
if (name && !isNativeTag(name)) {
tags.push(name)
}
let attrKeys = Object.keys(attribs)
/**
* 使用自定义组件是抽象组件
*/
if (/generic:/.test(attrKeys.join(';'))) {
attrKeys.forEach(key => {
/generic:/.test(key) && tags.push(attribs[key])
})