使用 JavaScript 从 HTML 字符串中提取文本
声明:本页面是StackOverFlow热门问题的中英对照翻译,遵循CC BY-SA 4.0协议,如果您需要使用它,必须同样遵循CC BY-SA许可,注明原文地址和作者信息,同时你必须将它归于原作者(不是我):StackOverFlow
原文地址: http://stackoverflow.com/questions/28899298/
Warning: these are provided under cc-by-sa 4.0 license. You are free to use/share it, But you must attribute it to the original authors (not me):
StackOverFlow
Extract the text out of HTML string using JavaScript
提问by Toshkuuu
I am trying to get the inner text of HTML string, using a JS function(the string is passed as an argument). Here is the code:
我正在尝试使用 JS 函数(字符串作为参数传递)获取 HTML 字符串的内部文本。这是代码:
/**
 * The question's original attempt at collecting the text between the tags of
 * an HTML string and logging it. Kept exactly as posted, because this is the
 * code the question is asking about.
 *
 * @param {string} value - HTML markup to scan.
 */
function extractContent(value) {
var content_holder = "";
for (var i = 0; i < value.length; i++) {
if (value.charAt(i) === '>') {
// `continue` jumps straight to the next iteration, so the `while` below is
// unreachable and nothing is ever appended to content_holder.
continue;
// Even if this loop were reached, it never advances `i`, so its condition
// would keep testing the same character.
while (value.charAt(i) != '<') {
content_holder += value.charAt(i);
}
}
}
// content_holder is still the empty string at this point.
console.log(content_holder);
}
extractContent("<p>Hello</p><a href='http://w3c.org'>W3C</a>");
The problem is that nothing gets printed on the console (*content_holder* stays empty). I think the problem is caused by the === operator.
问题是控制台上没有打印任何内容(*content_holder* 一直为空)。我认为问题是由 === 运算符引起的。
回答by Rick Hitchcock
Create an element, store the HTML in it, and get its textContent:
创建一个元素,在其中存储 HTML,并获取它的textContent:
/**
 * Extracts the plain text of an HTML string by letting the browser parse it.
 *
 * The markup is parsed inside a <template> element. Template content is
 * inert, so inline event handlers in the input (for example
 * <img onerror="...">) are not run and no resources are fetched while
 * parsing. Assigning the same markup to innerHTML of an ordinary element
 * created from the live document would not give that guarantee.
 *
 * @param {string} s - HTML markup to extract text from.
 * @returns {string} The concatenated text of every node in the markup.
 */
function extractContent(s) {
  const template = document.createElement('template');
  template.innerHTML = s;
  return template.content.textContent;
}
alert(extractContent("<p>Hello</p><a href='http://w3c.org'>W3C</a>"));
Here's a version that allows you to have spaces between nodes, although you'd probably want that for block-level elements only:
这是一个允许您在节点之间有空格的版本,尽管您可能只希望对块级元素使用空格:
/**
 * Extracts the plain text of an HTML string, optionally separating the text
 * of adjacent elements with a single space.
 *
 * The markup is parsed inside an inert <template>, so inline event handlers
 * in the input are not executed.
 *
 * @param {string} s - HTML markup to extract text from.
 * @param {boolean} [space] - When truthy, a space is appended inside every
 *   element so that text from neighbouring elements does not run together.
 * @returns {string} The extracted text with runs of spaces collapsed to one.
 */
function extractContent(s, space) {
  const template = document.createElement('template');
  template.innerHTML = s;
  const fragment = template.content;

  if (space) {
    for (const element of fragment.querySelectorAll('*')) {
      // Append a text node rather than assigning to textContent: assigning
      // replaces all of the element's children, which would flatten nested
      // elements and lose the spacing between them.
      element.append(' ');
    }
  }

  return fragment.textContent.replace(/ +/g, ' ');
}
console.log(extractContent("<p>Hello</p><a href='http://w3c.org'>W3C</a>. Nice to <em>see</em><strong><em>you!</em></strong>"));
console.log(extractContent("<p>Hello</p><a href='http://w3c.org'>W3C</a>. Nice to <em>see</em><strong><em>you!</em></strong>",true));
回答by Rick Hitchcock
One line (more precisely, one statement) version:
一行(更准确地说,一个语句)版本:
/**
 * Parses the markup into a separate HTML document with DOMParser and returns
 * the text content of that document's root element.
 *
 * @param {string} html - HTML markup to extract text from.
 * @returns {string} The text content of the parsed document.
 */
function extractContent(html) {
  return new DOMParser().parseFromString(html, "text/html").documentElement.textContent;
}
回答by Mubeen Khan
textContent is a very good technique for achieving the desired result, but sometimes we don't want to load the DOM. A simple workaround is the following regular expression:
textContent 是一种非常好的实现预期结果的技术,但有时我们不想加载 DOM。因此,一个简单的变通方法是使用下面的正则表达式:
// Sample markup to strip.
const htmlString = "<p>Hello</p><a href='http://w3c.org'>W3C</a>";

// Delete every "<...>" run (a '<', one or more non-'>' characters, then '>').
// Only the tags are removed: character references such as &amp; are left
// as written.
const plainText = htmlString.replace(/<[^>]+>/g, '');
回答by Ahmer
Use this regex to remove the HTML tags and keep only the inner text of the HTML.
使用此正则表达式删除 HTML 标签,只保留 HTML 中的内部文本。
It outputs only HelloW3C; check it:
它只输出 HelloW3C,请查看:
// Strips every "<...>" run from `value`. `(?:.|\n)*?` lazily matches any
// characters, including newlines, up to the nearest '>'. The `m` flag has no
// effect here because the pattern contains no ^ or $ anchors.
// NOTE(review): `value` is not defined in this snippet; presumably it is the
// HTML string parameter from the question's function.
var content_holder = value.replace(/<(?:.|\n)*?>/gm, '');
回答by Sharique Ansari
Try This:-
尝试这个:-
<!DOCTYPE html>
<html>
<body>
<script type="text/javascript">
/**
 * Returns the text of an HTML string by parsing it into a detached <div>
 * and reading that element's textContent.
 *
 * NOTE(review): the markup is assigned to innerHTML as-is, so this should
 * only be fed trusted input.
 *
 * @param {string} value - HTML markup to extract text from.
 * @returns {string} The text content of the parsed markup.
 */
function extractContent(value) {
  const holder = document.createElement('div');
  holder.innerHTML = value;
  return holder.textContent;
}
window.onload=function()
{
alert(extractContent("<p>Hello</p><a href='http://w3c.org'>W3C</a>"));
};
</script>
</body>
</html>
回答by Adam MacDonald
You could temporarily write it out to a block-level element that is positioned off the page, something like this:
你可以暂时把它写到一个位于页面之外的块级元素......像这样:
HTML:
HTML:
<div id="tmp" style="position:absolute;top:-400px;left:-400px;">
</div>
JavaScript:
JavaScript:
<script type="text/javascript">
/**
 * Writes the markup into the off-screen element with id "tmp" and logs the
 * inner HTML of that element's first child element.
 *
 * @param {string} value - HTML markup to place into the temporary element.
 */
function extractContent(value) {
  const scratch = document.getElementById('tmp');
  scratch.innerHTML = value;
  // Only the first child element is inspected (the <p> in the example call).
  const firstElement = scratch.children[0];
  console.log(firstElement.innerHTML);
}
extractContent("<p>Hello</p><a href='http://w3c.org'>W3C</a>");
</script>
回答by Dane
You need an array to hold the values:
你需要一个数组来保存这些值:
/**
 * Collects the characters of an HTML string that lie outside its tags, one
 * array entry per character, logs the joined text and returns the array.
 *
 * The string is scanned once while tracking whether the current position is
 * inside a "<...>" tag. The '<' and '>' characters themselves are never
 * collected.
 *
 * @param {string} value - HTML markup to scan.
 * @returns {string[]} The characters found outside of tags, in order.
 */
function extractContent(value) {
  const contentHolder = [];
  let insideTag = false;

  for (const ch of value) {
    if (ch === '<') {
      insideTag = true;
    } else if (ch === '>') {
      insideTag = false;
    } else if (!insideTag) {
      contentHolder.push(ch);
    }
  }

  console.log(contentHolder.join(''));
  return contentHolder;
}
extractContent("<p>Hello</p><a href='http://w3c.org'>W3C</a>");

