Java Porter Stemmer(波特词干提取器)代码

声明:本页面是StackOverFlow热门问题的中英对照翻译,遵循CC BY-SA 4.0协议,如果您需要使用它,必须同样遵循CC BY-SA许可,注明原文地址和作者信息,同时你必须将它归于原作者(不是我):StackOverFlow 原文地址: http://stackoverflow.com/questions/9756653/
Warning: this content is provided under the CC BY-SA 4.0 license. You are free to use and share it, but you must attribute it to the original authors (not me): StackOverflow

提示:将鼠标放在中文语句上可以显示对应的英文。显示中英文
时间:2020-08-16 07:14:31  来源:igfitidea点击:

Porter Stemmer code

java nlp porter-stemmer

提问by user872009

I am kinda new to java. I am taking a course in NLP. I wanna know how I can run an input file on the porter stemmer java code.

我对java有点陌生。我正在学习 NLP 课程。我想知道如何在 porter stemmer java 代码上运行输入文件。

回答by agarwav

The class below is named PorterAlgo and has various functions for stemming.

下面的类被命名为 PorterAlgo 并且具有用于词干提取的各种函数。

package com.mycompany.algo;

/**
 * Mutable holder for a string, used as an out-parameter by
 * {@link PorterAlgo#hasSuffix(String, String, NewString)} to hand the stem
 * (the word without its suffix) back to the caller.
 */
class NewString {
  public String str;

  NewString() {
     str = "";
  }
}

/**
 * Suffix-stripping stemmer organised as five rewriting steps
 * ({@code step1} .. {@code step5}) plus a prefix stripper.
 *
 * <p>{@link #stripAffixes(String)} is the public entry point; the individual
 * steps are package-private so that they can be exercised separately. Every
 * step accepts the empty string and returns it unchanged, so the steps can be
 * chained even when an earlier step reduces a word to nothing.
 */
public class PorterAlgo {

  /** Step 2 rewrite rules: {suffix, replacement}; applied when measure(stem) &gt; 0. */
  private static final String[][] STEP2_RULES = {
     { "ational", "ate" },
     { "tional",  "tion" },
     { "enci",    "ence" },
     { "anci",    "ance" },
     { "izer",    "ize" },
     { "iser",    "ize" },
     { "abli",    "able" },
     { "alli",    "al" },
     { "entli",   "ent" },
     { "eli",     "e" },
     { "ousli",   "ous" },
     { "ization", "ize" },
     { "isation", "ize" },
     { "ation",   "ate" },
     { "ator",    "ate" },
     { "alism",   "al" },
     { "iveness", "ive" },
     { "fulness", "ful" },
     { "ousness", "ous" },
     { "aliti",   "al" },
     { "iviti",   "ive" },
     { "biliti",  "ble" }
  };

  /** Step 3 rewrite rules: {suffix, replacement}; applied when measure(stem) &gt; 0. */
  private static final String[][] STEP3_RULES = {
     { "icate", "ic" },
     { "ative", "" },
     { "alize", "al" },
     { "alise", "al" },
     { "iciti", "ic" },
     { "ical",  "ic" },
     { "ful",   "" },
     { "ness",  "" }
  };

  /** Step 4 suffixes, removed outright when measure(stem) &gt; 1. */
  private static final String[] STEP4_SUFFIXES = {
     "al", "ance", "ence", "er", "ic", "able", "ible", "ant", "ement", "ment", "ent", "sion", "tion",
     "ou", "ism", "ate", "iti", "ous", "ive", "ize", "ise"
  };

  /** Prefixes removed by {@link #stripPrefixes(String)}; the first match wins. */
  private static final String[] PREFIXES = {
     "kilo", "micro", "milli", "intra", "ultra", "mega", "nano", "pico", "pseudo"
  };

  /**
   * Removes every character that is not a letter or a digit.
   *
   * @param str text to filter; may be empty
   * @return {@code str} with only its letters and digits, in order
   */
  String Clean( String str ) {
     StringBuilder kept = new StringBuilder( str.length() );

     for ( int i = 0; i < str.length(); i++ ) {
         char c = str.charAt( i );
         if ( Character.isLetterOrDigit( c ) )
            kept.append( c );
     }

     return kept.toString();
  }

  /**
   * Tests whether {@code word} ends with {@code suffix} and is strictly
   * longer than it.
   *
   * @param word   word to inspect
   * @param suffix candidate suffix
   * @param stem   out-parameter; receives {@code word} minus its last
   *               {@code suffix.length()} characters
   * @return {@code true} when {@code word} is longer than {@code suffix} and
   *         ends with it
   */
  boolean hasSuffix( String word, String suffix, NewString stem ) {
     int wordLength = word.length();
     int suffixLength = suffix.length();

     if ( wordLength <= suffixLength )
        return false;

     // Cheap rejection on the second-to-last character. It also decides
     // whether stem.str is written when the suffix does not match, so it is
     // kept to leave that observable state exactly as before.
     if ( suffixLength > 1
        && word.charAt( wordLength - 2 ) != suffix.charAt( suffixLength - 2 ) )
        return false;

     stem.str = word.substring( 0, wordLength - suffixLength );
     return word.endsWith( suffix );
  }

  /**
   * Classifies a character as a vowel.
   *
   * @param ch   character to classify
   * @param prev character treated as its predecessor
   * @return {@code true} for a, e, i, o, u, and for 'y' unless {@code prev}
   *         is one of a, e, i, o, u
   */
  boolean vowel( char ch, char prev ) {
     switch ( ch ) {
        case 'a': case 'e': case 'i': case 'o': case 'u':
           return true;

        case 'y':
           return !isPlainVowel( prev );

        default:
           return false;
     }
  }

  /** Returns true when {@code ch} is one of a, e, i, o, u. */
  private static boolean isPlainVowel( char ch ) {
     return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u';
  }

  /**
   * Counts how many times a run of vowels is followed by a non-vowel in
   * {@code stem}.
   *
   * @param stem string to measure; may be empty
   * @return the number of vowel-run / non-vowel transitions
   */
  int measure( String stem ) {
     int length = stem.length();
     int count = 0;
     int i = 0;

     while ( i < length ) {
        // Advance to the next vowel; the first character is judged as if it
        // followed 'a', so a leading 'y' is not a vowel here.
        while ( i < length && !vowel( stem.charAt( i ), i > 0 ? stem.charAt( i - 1 ) : 'a' ) )
           i++;

        // Skip the rest of the vowel run.
        i++;
        while ( i < length && vowel( stem.charAt( i ), stem.charAt( i - 1 ) ) )
           i++;

        // A non-vowel ended the run: count it and step past it.
        if ( i < length ) {
           count++;
           i++;
        }
     }

     return count;
  }

  /**
   * Reports whether {@code word} contains at least one vowel as defined by
   * {@link #vowel(char, char)}; the first character is judged as if it
   * followed 'a'.
   *
   * @param word string to scan; may be empty
   * @return {@code true} if any position holds a vowel
   */
  boolean containsVowel( String word ) {
     for ( int i = 0; i < word.length(); i++ ) {
         char prev = ( i > 0 ) ? word.charAt( i - 1 ) : 'a';
         if ( vowel( word.charAt( i ), prev ) )
            return true;
     }

     return false;
  }

  /**
   * Tests whether {@code str} ends in non-vowel, vowel, non-vowel where the
   * final character is not 'w', 'x' or 'y'.
   *
   * @param str string to test
   * @return {@code true} when the last three characters match that shape;
   *         always {@code false} for fewer than three characters
   */
  boolean cvc( String str ) {
     int length = str.length();

     if ( length < 3 )
        return false;

     char last = str.charAt( length - 1 );
     char middle = str.charAt( length - 2 );
     char first = str.charAt( length - 3 );

     if ( vowel( last, middle ) || last == 'w' || last == 'x' || last == 'y' )
        return false;
     if ( !vowel( middle, first ) )
        return false;

     // NOTE(review): a three-character string uses '?' as the predecessor of
     // its first character, which makes a leading 'y' count as a vowel here,
     // whereas measure/containsVowel use 'a' (leading 'y' is not a vowel).
     // Behaviour kept as is - confirm which one is intended.
     char beforeFirst = ( length == 3 ) ? '?' : str.charAt( length - 4 );
     return !vowel( first, beforeFirst );
  }

  /**
   * Step 1: handles plural "s" endings, then "eed"/"ed"/"ing", then a
   * trailing "y" (rewritten to "i" when the stem contains a vowel).
   *
   * @param str word to process; may be empty
   * @return the rewritten word; {@code ""} when the input is {@code ""} or
   *         exactly {@code "s"}
   */
  String step1( String str ) {
     if ( str.length() == 0 )
        return str;

     NewString stem = new NewString();

     if ( str.charAt( str.length() - 1 ) == 's' ) {
        if ( hasSuffix( str, "sses", stem ) || hasSuffix( str, "ies", stem ) ) {
           str = dropLast( str, 2 );
        }
        else {
           if ( str.length() == 1 )
              return "";
           if ( str.charAt( str.length() - 2 ) != 's' )
              str = dropLast( str, 1 );
        }
     }

     if ( hasSuffix( str, "eed", stem ) ) {
        if ( measure( stem.str ) > 0 )
           str = dropLast( str, 1 );
     }
     else if ( hasSuffix( str, "ed", stem ) || hasSuffix( str, "ing", stem ) ) {
        if ( containsVowel( stem.str ) ) {
           // stem.str is the word without the suffix that matched.
           str = stem.str;
           if ( str.length() == 1 )
              return str;

           if ( hasSuffix( str, "at", stem ) || hasSuffix( str, "bl", stem ) || hasSuffix( str, "iz", stem ) ) {
              str += "e";
           }
           else {
              int length = str.length();
              char last = str.charAt( length - 1 );

              // NOTE(review): only equality of the last two characters is
              // checked, not that they are consonants, so "see" (from
              // "seeing") becomes "se". Porter's published rule is about a
              // double consonant - confirm whether this is intended.
              if ( last == str.charAt( length - 2 ) && last != 'l' && last != 's' && last != 'z' )
                 str = dropLast( str, 1 );
              else if ( measure( str ) == 1 && cvc( str ) )
                 str += "e";
           }
        }
     }

     if ( hasSuffix( str, "y", stem ) && containsVowel( stem.str ) )
        str = dropLast( str, 1 ) + "i";

     return str;
  }

  /**
   * Step 2: replaces the first matching suffix from the step-2 table when
   * the remaining stem has measure &gt; 0.
   *
   * @param str word to process; may be empty
   * @return the rewritten word, or {@code str} when no rule applies
   */
  String step2( String str ) {
     return applyRules( str, STEP2_RULES );
  }

  /**
   * Step 3: replaces the first matching suffix from the step-3 table when
   * the remaining stem has measure &gt; 0.
   *
   * @param str word to process; may be empty
   * @return the rewritten word, or {@code str} when no rule applies
   */
  String step3( String str ) {
     return applyRules( str, STEP3_RULES );
  }

  /**
   * Step 4: removes the first suffix from the step-4 list whose remaining
   * stem has measure &gt; 1.
   *
   * @param str word to process; may be empty
   * @return the stem, or {@code str} when no suffix qualifies
   */
  String step4( String str ) {
     NewString stem = new NewString();

     for ( int index = 0; index < STEP4_SUFFIXES.length; index++ ) {
         if ( hasSuffix( str, STEP4_SUFFIXES[index], stem ) && measure( stem.str ) > 1 )
            return stem.str;
     }

     return str;
  }

  /**
   * Step 5: drops a final "e" (always when measure &gt; 1; when measure is 1
   * only if the remainder is not a {@link #cvc(String)} ending), then
   * reduces a final "ll" to "l" when measure &gt; 1.
   *
   * @param str word to process; may be empty
   * @return the rewritten word
   */
  String step5( String str ) {
     // Without this guard an empty word (e.g. the result of step1("s"))
     // would fail on charAt(-1).
     if ( str.length() == 0 )
        return str;

     if ( str.charAt( str.length() - 1 ) == 'e' ) {
        int m = measure( str );
        if ( m > 1 ) {
           str = dropLast( str, 1 );
        }
        else if ( m == 1 ) {
           String stem = dropLast( str, 1 );
           if ( !cvc( stem ) )
              str = stem;
        }
     }

     if ( str.length() == 1 )
        return str;

     if ( str.charAt( str.length() - 1 ) == 'l'
        && str.charAt( str.length() - 2 ) == 'l'
        && measure( str ) > 1 )
        str = dropLast( str, 1 );

     return str;
  }

  /**
   * Removes the first matching prefix from the fixed prefix list.
   *
   * @param str word to process
   * @return {@code str} without the prefix, or {@code str} itself when none
   *         matches; {@code ""} when the word equals a prefix
   */
  String stripPrefixes ( String str) {
     for ( int i = 0; i < PREFIXES.length; i++ ) {
         if ( str.startsWith( PREFIXES[i] ) )
            return str.substring( PREFIXES[i].length() );
     }

     return str;
  }

  /** Runs steps 1-5 in order, stopping the chain once the word is empty. */
  private String stripSuffixes( String str ) {

     str = step1( str );
     if ( str.length() >= 1 )
        str = step2( str );
     if ( str.length() >= 1 )
        str = step3( str );
     if ( str.length() >= 1 )
        str = step4( str );
     if ( str.length() >= 1 )
        str = step5( str );

     return str;
  }

  /**
   * Lower-cases {@code str}, drops non-alphanumeric characters and, for
   * results longer than two characters, strips a known prefix and then
   * applies the five suffix steps.
   *
   * @param str raw token; may be empty
   * @return the stemmed token (possibly empty)
   */
  public String stripAffixes( String str ) {

     // A fixed locale keeps the lower-casing independent of the JVM default
     // (under a Turkish default "I" would become a dotless i and no suffix
     // rule would match).
     str = str.toLowerCase( java.util.Locale.ENGLISH );
     str = Clean( str );

     if ( str.length() > 2 ) {
        str = stripPrefixes( str );

        // Compare by length, not by reference: a prefix-only word is empty here.
        if ( str.length() > 0 )
           str = stripSuffixes( str );
     }

     return str;
  }

  /** Returns {@code s} without its last {@code n} characters. */
  private static String dropLast( String s, int n ) {
     return s.substring( 0, s.length() - n );
  }

  /** Applies the first {suffix, replacement} rule whose stem has measure &gt; 0. */
  private String applyRules( String str, String[][] rules ) {
     NewString stem = new NewString();

     for ( int index = 0; index < rules.length; index++ ) {
         if ( hasSuffix( str, rules[index][0], stem ) && measure( stem.str ) > 0 )
            return stem.str + rules[index][1];
     }

     return str;
  }

}

Given below is a class PorterCheck.java

下面给出的是一个类 PorterCheck.java

package com.mycompany.algo;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.*;

/**
 * Small command-line driver that exercises {@code PorterAlgo}: it prints the
 * result of a few individual helper calls, stems a fixed word list, and then
 * stems the tokens of an input file.
 *
 * <p>The input file is {@code args[0]} when given, otherwise
 * {@link #DEFAULT_TEST_FILE}.
 */
public class PorterCheck {
    private static final String DEFAULT_TEST_FILE = "C:/Users/vaibhav/Desktop/rev.txt";

    /**
     * Runs the demonstration.
     *
     * @param args optional; {@code args[0]} is the path of the file to stem
     * @throws IOException if the input file cannot be opened or closed
     */
    public static void main(String[] args) throws IOException {
        PorterAlgo pa = new PorterAlgo();

        // checks for vowels in a given string
        System.out.println(pa.containsVowel("vaibhav"));

        // removes special characters
        System.out.println(pa.Clean("vaibhav's book"));

        // check for a given suffix
        NewString stem = new NewString();
        System.out.println(pa.hasSuffix("corresponding", "ing", stem));

        // stemming the words
        ArrayList<String> tok = new ArrayList<String>();
        String[] tokens = {"normalize", "technical", "education"};
        for (String x : tokens) {
            tok.add(x);
        }
        System.out.println(completeStem(tok));

        String fileName = ((args.length > 0) ? args[0] : DEFAULT_TEST_FILE);
        FileReader fileReader = new FileReader(new File(fileName));
        List<String> tokens1;
        try {
            // NOTE(review): FileTokenizer is not defined in this file; it is
            // assumed to be a project class whose tokenize(Reader) returns the
            // file's tokens - confirm it is on the classpath.
            FileTokenizer fileTokenizer = new FileTokenizer();
            tokens1 = fileTokenizer.tokenize(fileReader);
        } finally {
            // Release the file handle even if tokenizing fails.
            fileReader.close();
        }

        System.out.println("Tokenizing the input file:");
        System.out.print(completeStem(tokens1));
    }

    /**
     * Stems every token by running it through steps 1 to 5 of
     * {@code PorterAlgo}.
     *
     * <p>The chain stops as soon as a token becomes empty (for example
     * {@code step1("s")} returns {@code ""}), because the later steps are
     * only meaningful for non-empty words.
     *
     * @param tokens1 words to stem
     * @return a new list with one stemmed entry per input token, in order
     */
    public static ArrayList<String> completeStem(List<String> tokens1) {
        PorterAlgo pa = new PorterAlgo();
        ArrayList<String> arrstr = new ArrayList<String>();
        for (String token : tokens1) {
            String stemmed = token;
            if (stemmed.length() > 0) {
                stemmed = pa.step1(stemmed);
            }
            if (stemmed.length() > 0) {
                stemmed = pa.step2(stemmed);
            }
            if (stemmed.length() > 0) {
                stemmed = pa.step3(stemmed);
            }
            if (stemmed.length() > 0) {
                stemmed = pa.step4(stemmed);
            }
            if (stemmed.length() > 0) {
                stemmed = pa.step5(stemmed);
            }
            arrstr.add(stemmed);
        }
        return arrstr;
    }

    /**
     * Splits the fixed sample sentence {@code "this is a book"} on whitespace.
     *
     * @return the tokens of the sample sentence
     */
    public static ArrayList<String> fileTokenizer() {
        StringTokenizer strtoken = new StringTokenizer("this is a book");
        ArrayList<String> filetoken = new ArrayList<String>();
        while (strtoken.hasMoreTokens()) {
            filetoken.add(strtoken.nextToken());
        }
        return filetoken;
    }
}

Hope this helps you :D

希望这对你有帮助:D