用于过滤字符串中常见单词的 JavaScript 代码

Question

提问by Test Tester

I'm trying to build JavaScript code that reads one string (say a sentence of English text), then outputs another string of (comma-separated) words that were "uncommon". Something like:

我正在尝试构建 JavaScript 代码来读取一个字符串（比如一个英文文本的句子），然后输出另一串（逗号分隔的）“不常见”的单词。就像是：

    var sentence="The dog ran to the other side of the field."; 

    var common_words="the, it is, we all, a, an, by, to, you, me, he, she, they, we, how, it, i, are, to, for, of";

--Some JavaScript code--

-- 一些 JavaScript 代码--

    var uncommon_words="dog, ran, other, side, field";

How can I do this?

我怎样才能做到这一点？

Answer 1

回答by Šime Vidas

Here you go:

给你：

/**
 * Returns the words of a sentence that are not in a list of common words.
 *
 * Words are runs of word characters (letters, digits, underscore) and are
 * compared case-insensitively. Multi-word entries in `common` (for example
 * "it is") can never match, because the sentence is compared word by word.
 *
 * @param {string} sentence Text to scan.
 * @param {string} common Comma-separated list of words to exclude.
 * @returns {string[]} Lower-cased uncommon words in order of appearance
 *     (duplicates are kept); an empty array when the sentence has no words.
 */
function getUncommon(sentence, common) {
    // match() returns null (not an empty array) when nothing matches.
    var wordArr = sentence.match(/\w+/g) || [],
        commonList = common.split(','),
        // Prototype-less dictionary: with a plain {} the word "constructor"
        // would be found on Object.prototype and wrongly treated as common.
        commonObj = Object.create(null),
        uncommonArr = [],
        word, i;

    for ( i = 0; i < commonList.length; i++ ) {
        // Lower-cased so that entries match the lower-cased sentence words.
        commonObj[ commonList[i].trim().toLowerCase() ] = true;
    }

    for ( i = 0; i < wordArr.length; i++ ) {
        word = wordArr[i].toLowerCase();
        if ( !commonObj[word] ) {
            uncommonArr.push(word);
        }
    }

    return uncommonArr;
}

Live demo: http://jsfiddle.net/simevidas/knXkS/

现场演示：http://jsfiddle.net/simevidas/knXkS/

Answer 2

回答by Fareed Alnamrouti

The words you want to remove are called stop words, which are:

您要删除的词称为停用词，如下所示：

["a", "able", "about", "across", "after", "all", "almost", "also", "am", "among", "an", "and", "any", "are", "as", "at", "be", "because", "been", "but", "by", "can", "cannot", "could", "dear", "did", "do", "does", "either", "else", "ever", "every", "for", "from", "get", "got", "had", "has", "have", "he", "her", "hers", "him", "his", "how", "however", "i", "if", "in", "into", "is", "it", "its", "just", "least", "let", "like", "likely", "may", "me", "might", "most", "must", "my", "neither", "no", "nor", "not", "of", "off", "often", "on", "only", "or", "other", "our", "own", "rather", "said", "say", "says", "she", "should", "since", "so", "some", "than", "that", "the", "their", "them", "then", "there", "these", "they", "this", "tis", "to", "too", "twas", "us", "wants", "was", "we", "were", "what", "when", "where", "which", "while", "who", "whom", "why", "will", "with", "would", "yet", "you", "your", "ain't", "aren't", "can't", "could've", "couldn't", "didn't", "doesn't", "don't", "hasn't", "he'd", "he'll", "he's", "how'd", "how'll", "how's", "i'd", "i'll", "i'm", "i've", "isn't", "it's", "might've", "mightn't", "must've", "mustn't", "shan't", "she'd", "she'll", "she's", "should've", "shouldn't", "that'll", "that's", "there's", "they'd", "they'll", "they're", "they've", "wasn't", "we'd", "we'll", "we're", "weren't", "what'd", "what's", "when'd", "when'll", "when's", "where'd", "where'll", "where's", "who'd", "who'll", "who's", "why'd", "why'll", "why's", "won't", "would've", "wouldn't", "you'd", "you'll", "you're", "you've"]

here is the source: http://99webtools.com/list-english-stop-words.php

这是来源：http://99webtools.com/list-english-stop-words.php

so your code should be

所以你的代码应该是

/**
 * Returns the words of a sentence that are not English stop words.
 *
 * A word is a run of word characters that may contain inner apostrophes, so
 * contractions such as "don't" stay in one piece and can match the
 * contraction entries of the stop-word list.
 *
 * @param {string} sentence Text to scan.
 * @returns {string[]} Lower-cased non-stop words in order of appearance
 *     (duplicates are kept); an empty array when the sentence has no words.
 */
function getNoneStopWords(sentence) {
    var common = getStopWords();
    // match() returns null (not an empty array) when nothing matches.
    var wordArr = sentence.match(/\w+(?:'\w+)*/g) || [],
        // Prototype-less dictionary: with a plain {} the word "constructor"
        // would be found on Object.prototype and wrongly treated as a stop word.
        commonObj = Object.create(null),
        uncommonArr = [],
        word, i;

    for (i = 0; i < common.length; i++) {
        commonObj[ common[i].trim() ] = true;
    }

    for (i = 0; i < wordArr.length; i++) {
        word = wordArr[i].toLowerCase();
        if (!commonObj[word]) {
            uncommonArr.push(word);
        }
    }
    return uncommonArr;
}

/**
 * Returns the list of English stop words (all lower case, including
 * common contractions).
 *
 * @returns {string[]} A new array on every call.
 */
function getStopWords() {
    return ["a", "able", "about", "across", "after", "all", "almost", "also", "am", "among", "an", "and", "any", "are", "as", "at", "be", "because", "been", "but", "by", "can", "cannot", "could", "dear", "did", "do", "does", "either", "else", "ever", "every", "for", "from", "get", "got", "had", "has", "have", "he", "her", "hers", "him", "his", "how", "however", "i", "if", "in", "into", "is", "it", "its", "just", "least", "let", "like", "likely", "may", "me", "might", "most", "must", "my", "neither", "no", "nor", "not", "of", "off", "often", "on", "only", "or", "other", "our", "own", "rather", "said", "say", "says", "she", "should", "since", "so", "some", "than", "that", "the", "their", "them", "then", "there", "these", "they", "this", "tis", "to", "too", "twas", "us", "wants", "was", "we", "were", "what", "when", "where", "which", "while", "who", "whom", "why", "will", "with", "would", "yet", "you", "your", "ain't", "aren't", "can't", "could've", "couldn't", "didn't", "doesn't", "don't", "hasn't", "he'd", "he'll", "he's", "how'd", "how'll", "how's", "i'd", "i'll", "i'm", "i've", "isn't", "it's", "might've", "mightn't", "must've", "mustn't", "shan't", "she'd", "she'll", "she's", "should've", "shouldn't", "that'll", "that's", "there's", "they'd", "they'll", "they're", "they've", "wasn't", "we'd", "we'll", "we're", "weren't", "what'd", "what's", "when'd", "when'll", "when's", "where'd", "where'll", "where's", "who'd", "who'll", "who's", "why'd", "why'll", "why's", "won't", "would've", "wouldn't", "you'd", "you'll", "you're", "you've"];
}

Answer 3

回答by Mario

How about this?

这个怎么样？

// Strips the listed common words (whole words, case-insensitive) from `sentence`.
// NOTE(review): the trailing "..." appears to be a placeholder for the remaining
// alternatives; taken literally, "we..." matches "we" followed by any three characters.
// replace() returns the new string and does not modify `sentence`, so the result
// has to be captured by the caller.
sentence.replace(/\b(?:the|it is|we all|an?|by|to|you|[mh]e|she|they|we...)\b/ig, '');

This should remove all common words from your sentence. Just split the remaining string the way you want.

这应该从你的句子中删除所有常用词。只需按照您想要的方式拆分剩余的字符串。

Answer 4

回答by Max

Build an associative array of common words first, then tokenize the sentence and output any words not contained in it. E.g.

首先构建一个常用词的关联数组，然后对序列进行标记化以输出其中未包含的任何词。例如

// Builds `uncommon_words`: the words of `sentence` that are not listed in the
// comma-separated `common_words`, joined with ", " (original casing is kept).

// Prototype-less dictionary: with a plain object the word "constructor" would be
// found on Object.prototype and wrongly treated as excluded.
var excluded = Object.create(null);
// NOTE: this re-binds common_words from the comma-separated string to an array.
common_words = common_words.split(",");
// Index loops instead of for...in: for...in would also visit any enumerable
// property added to Array.prototype.
for (var i = 0; i < common_words.length; i++) {
    excluded[common_words[i].trim().toLowerCase()] = true;
}
var result = [];
// match() returns null when the sentence contains no word characters.
var match = sentence.match(/\w+/g) || [];
for (i = 0; i < match.length; i++) {
    if (!excluded[match[i].toLowerCase()]) {
        result.push(match[i]);
    }
}
var uncommon_words = result.join(", ");

Answer 5

回答by Simon Scarfe

Here's a start, I reckon:

这是一个开始，我认为：

// Builds `uncommon_words`: the lower-cased words of `sentence` that are not
// listed in the comma-separated `common_words`, joined with ", ".

// Split at non-word characters. Consecutive separators (e.g. ". ") and a
// trailing separator produce empty strings, which are skipped below.
var sentence_arr = sentence.split(/(?=\w)\b|\W/);
// Accept "a,b" as well as "a, b".
var common_arr = common_words.split(/,\s*/);

// Lower-cased lookup of the common words. Prototype-less so that words such as
// "constructor" are not found on Object.prototype.
var common_lookup = Object.create(null);
for (var j = 0; j < common_arr.length; j++) {
    common_lookup[common_arr[j].trim().toLowerCase()] = true;
}

var uncommon_arr = [];
for (var i = 0; i < sentence_arr.length; i++) {
    var word = sentence_arr[i].toLowerCase();
    // A word is uncommon only if it matches NONE of the common words.
    if (word !== '' && !common_lookup[word]) {
        uncommon_arr.push(word);
    }
}

var uncommon_words = uncommon_arr.join(', ');

completely untested, but the point is you split both sentences and individually check each word against each member of that list. Kinda naive, and totally doesn't scale, but would be fine with small examples such as this.

完全未经测试，但关键是您拆分两个句子并针对该列表的每个成员单独检查每个单词。有点天真，完全不能扩展，但是对于这样的小例子就可以了。

Answer 6

回答by Wil Moore III

The String#diff function returns a list of differences (uncommon terms). The terms can be provided as an array or a string.

该String#diff函数返回一个差异列表（不常见的术语）。这些术语可以作为数组或字符串提供。

You call it like: sentence.diff(terms). Below is a unit test:

你这样称呼它：sentence.diff(terms)。下面是一个单元测试：

// Smoke test for String#diff: logs 'pass' when the uncommon words of the
// sample sentence come back as expected, 'fail' otherwise.
var sentence = 'The dog ran to the other side of the field.';
// Terms may be given as a comma-separated string (as here) or as an array.
var terms    = 'the, it is, we all, a, an, by, to, you, me, he, she, they, we, how, it, i, are, to, for, of';

if (sentence.diff(terms).toString() === 'dog,ran,other,side,field') {
  console.log('pass');
} else {
  console.log('fail');
}

Below is the 'String.diff' function definition:

下面是“ String.diff”函数定义：

/**
 * Returns the words of this string that do not appear in `terms`.
 *
 * Words are runs of word characters (letters, digits, underscore); the
 * comparison is case-insensitive and ignores whitespace around each term.
 *
 * @param {string|Array} terms Terms to exclude, either as a comma-separated
 *     string or as an array. Array entries are converted to strings.
 * @returns {string[]} The remaining words in their original casing and order;
 *     an empty array when `terms` is missing, empty, or of another type.
 */
String.prototype.diff = function(terms){
  if (!terms) {
    return [];
  }

  var list = (typeof terms === 'string') ? terms.split(',') : terms;

  if (!Array.isArray(list)) {
    return [];
  }

  // Prototype-less dictionary gives constant-time lookups without picking up
  // keys from Object.prototype.
  var excluded = Object.create(null);
  list.forEach(function(term){
    // String() keeps non-string entries (e.g. numbers) from throwing.
    excluded[String(term).trim().toLowerCase()] = true;
  });

  // Splitting on separators can yield empty strings at the ends; drop them.
  return this.split(/\W+/).filter(function(word){
    return word.length > 0 && !excluded[word.toLowerCase()];
  });
};

用于过滤字符串中常见单词的 JavaScript 代码

提问by Test Tester

回答by Šime Vidas

回答by Fareed Alnamrouti

回答by Mario

回答by Max

回答by Simon Scarfe

回答by Wil Moore III

相关推荐

最近更新

标签

用于过滤字符串中常见单词的 JavaScript 代码

提问by Test Tester

回答by Šime Vidas

回答by Fareed Alnamrouti

回答by Mario

回答by Max

回答by Simon Scarfe

回答by Wil Moore III

相关推荐

通过 Javascript 将值传递给控制器​​返回 View MVC3 Razor

javascript 如何使谷歌地图默认指向美国？

javascript 在 jQuery 对象上保存和恢复“onclick”操作

javascript 从 div ID 获取所有图像并添加链接

相关推荐

最近更新

标签

通过 Javascript 将值传递给控制器返回 View MVC3 Razor