现在的位置: 首页 > 综合 > 正文

使用MapReduce程序对KDD Cup 99数据集进行信息检索(二)

2013年09月17日 ⁄ 综合 ⁄ 共 4932字 ⁄ 字号 评论关闭

不知道怎么回事,刚才写好的文章可能字数太多?还是图片太多?导致我文章的后半部分没有了。所以接着写第二篇吧。

 

web页面中查看程序执行结果如图6.8所示。

 

可以从图6.8中看到,现在显示的是HDFS中路径为 /user/hadoop/KDDCUP_OUTPUT/part-00000 的文件。文件内容在图6.8的下方,文件中的每一条记录正是WordSearch程序在500万条记录中经过检索后得出的结果。输出的格式也正是上文中已经提到过的 <[filename]::[offset], [searchWord]::[line]>。

 

/***************************************************

* WordSearch V2.0

* 采用旧的Hadoop API

* by think_cxf 2011-04

* Input:<Offset,line>

* Output: <[filename]::[offset], [searchWord]::[line]>

* ***************************************************/

 

import java.io.*;
import java.util.*;

import org.apache.hadoop.conf.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;

 

public class WordSearch extends Configured implements Tool

{

       /********************************

        * Map:

        * Input:  <offset,line>

        * Output: <[filename::Offset], [searchWord]::[line]>

        *********************************/

      public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, Text>

      {

              private Text              mapKey   = new Text(); //output in map function [key]

             private Text               mapValue = new Text(); //output in map function [value]

             private String             pattern      = "[^//w]";

              //正则表达式,代表不是0-9, a-z, A-Z的所有其它字符 

             private String     sWord;

              //需要查找的单词,用于传递需要搜索的单词给Reduce函数

             private String    temp;

             private String     FileName; //文件名

             private JobConf        conf;

             public void configure(JobConf conf) {

                 this.conf = conf;

             } 

 

             public void map(LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException

             {

                    String line = value.toString().toLowerCase(); //WORD --> word

                    line = line.replaceAll(pattern, " "); //将非0-9, a-z, A-Z的字符替换为空格

               

                    FileName = conf.get("map.input.file");//get the current file name

                    sWord = conf.get("searchWord");//get the search word

               

                    //int Offset = (int)key.get();

                    //IntWritable lineOffset = new IntWritable(Offset);//Offset

                    System.out.println("****sWord is:"+sWord+".****The Line is:"+line+'.');

                    System.out.println("FileName is:"+FileName+'.'+"/nThe Line is:"+line+'.');

                    StringTokenizer itr= new StringTokenizer(line);

                    while (itr.hasMoreTokens())

                    {

                           temp=itr.nextToken();

                           System.out.println("Temp word is:"+temp+'.');

                           if( temp.compareTo(sWord) == 0)

                            //比较两个单词,如果匹配,则写入键值对

                           {

                                  System.out.println("----Find one!----");

                                  mapKey.set('['+ FileName.toString() + "]::["+ key.toString() + "]:" );

                                   // become [filename]::[Offset]:

                                  mapValue.set('['+sWord.toString() + "] [" + line.toString() + "]");

                                   // become [searchWord] [line]

                                  output.collect(mapKey,mapValue);//产生<word,行号>这样的键值对

                           }//if end

                    }//while end

             }//public map end

       }// class Map end

 

             /************************************

            * Reduce Function

       * Input:<[filename::searchWord], [[searchWord] [line]1,[searchWord] [line]2,

       *[searchWord] [line]3,...,[searchWord] [line]n]>

            * Output:<[filename::searchWord],Offset>

            **************************************/

       public static class Reduce extends MapReduceBase implements Reducer<Text, Text, Text, Text>

       {

public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException

             {

                     while (values.hasNext()) {

                            output.collect(key,values.next());

                    } //while end

             } //public reduce end

       } //public class Reduce end

  

      public int run(String[] args) throws Exception

      {

              JobConf conf = new JobConf(getConf(), WordSearch.class);

             conf.setJobName("wordsearch");   //set job name

             conf.set("searchWord", args[2]);    //send the "searchWord" to the system

             conf.setOutputKeyClass(Text.class);

             conf.setOutputValueClass(Text.class);

   

             conf.setMapperClass(Map.class);

             conf.setCombinerClass(Reduce.class);

             conf.setReducerClass(Reduce.class);

  

             conf.setInputFormat(TextInputFormat.class);

             conf.setOutputFormat(TextOutputFormat.class);

 

             FileInputFormat.setInputPaths(conf, new Path(args[0])  );

             FileOutputFormat.setOutputPath(conf, new Path(args[1]) );

             //setSearchWord(args[2]);

             JobClient.runJob(conf);

             return 0;

      }

  

      public static void main(String[] args) throws Exception

      {

             int exitCode = ToolRunner.run(new Configuration(), new WordSearch(), args);

             System.exit(exitCode);

       }

}

 

抱歉!评论已关闭.