现在的位置: 首页 > 综合 > 正文

使用MapReduce程序对KDD Cup 99数据集进行信息检索(二)

2013年09月17日 ⁄ 综合 ⁄ 共 4932字 ⁄ 字号 评论关闭

不知道怎么回事,刚才写好的文章可能字数太多?还是图片太多?导致我文章的后半部分没有了。所以接着写第二篇吧。

 

web页面中查看程序执行结果如图6.8所示。

 

可以从图6.8中看到,现在显示的是HDFS中路径为 /user/hadoop/KDDCUP_OUTPUT/part-00000 的文件。文件内容在图6.8的下方,文件中的每一条记录正是WordSearch程序在500万条记录中经过检索后得出的结果。输出的格式也正是上文中已经提到过的 <[filename]::[offset], [searchWord]::[line]>。

 

/***************************************************

* WordSearch V2.0

* 采用旧的Hadoop API

* by think_cxf 2011-04

* Input:<Offset,line>

* Output: <[filename]::[offset], [searchWord]::[line]>

* ***************************************************/

 

import java.io.*;
import java.util.*;

import org.apache.hadoop.conf.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;

 

public class WordSearch extends Configured implements Tool

{

       /********************************

        * Map:

        * Input:  <offset,line>

        * Output: <[filename::Offset], [searchWord]::[line]>

        *********************************/

      public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, Text>

      {

              private Text              mapKey   = new Text(); //output in map function [key]

             private Text               mapValue = new Text(); //output in map function [value]

             private String             pattern      = "[^//w]";

              //正则表达式,代表不是0-9, a-z, A-Z的所有其它字符 

             private String     sWord;

              //需要查找的单词,用于传递需要搜索的单词给Reduce函数

             private String    temp;

             private String     FileName; //文件名

             private JobConf        conf;

             public void configure(JobConf conf) {

                 this.conf = conf;

             } 

 

             public void map(LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException

             {

                    String line = value.toString().toLowerCase(); //WORD --> word

                    line = line.replaceAll(pattern, " "); //将非0-9, a-z, A-Z的字符替换为空格

               

                    FileName = conf.get("map.input.file");//get the current file name

                    sWord = conf.get("searchWord");//get the search word

               

                    //int Offset = (int)key.get();

                    //IntWritable lineOffset = new IntWritable(Offset);//Offset

                    System.out.println("****sWord is:"+sWord+".****The Line is:"+line+'.');

                    System.out.println("FileName is:"+FileName+'.'+"/nThe Line is:"+line+'.');

                    StringTokenizer itr= new StringTokenizer(line);

                    while (itr.hasMoreTokens())

                    {

                           temp=itr.nextToken();

                           System.out.println("Temp word is:"+temp+'.');

                           if( temp.compareTo(sWord) == 0)

                            //比较两个单词,如果匹配,则写入键值对

                           {

                                  System.out.println("----Find one!----");

                                  mapKey.set('['+ FileName.toString() + "]::["+ key.toString() + "]:" );

                                   // become [filename]::[Offset]:

                                  mapValue.set('['+sWord.toString() + "] [" + line.toString() + "]");

                                   // become [searchWord] [line]

                                  output.collect(mapKey,mapValue);//产生<word,行号>这样的键值对

                           }//if end

                    }//while end

             }//public map end

       }// class Map end

 

             /************************************

            * Reduce Function

       * Input:<[filename::searchWord], [[searchWord] [line]1,[searchWord] [line]2,

       *[searchWord] [line]3,...,[searchWord] [line]n]>

            * Output:<[filename::searchWord],Offset>

            **************************************/

       public static class Reduce extends MapReduceBase implements Reducer<Text, Text, Text, Text>

       {

public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException

             {

                     while (values.hasNext()) {

                            output.collect(key,values.next());

                    } //while end

             } //public reduce end

       } //public class Reduce end

  

      public int run(String[] args) throws Exception

      {

              JobConf conf = new JobConf(getConf(), WordSearch.class);

             conf.setJobName("wordsearch");   //set job name

             conf.set("searchWord", args[2]);    //send the "searchWord" to the system

             conf.setOutputKeyClass(Text.class);

             conf.setOutputValueClass(Text.class);

   

             conf.setMapperClass(Map.class);

             conf.setCombinerClass(Reduce.class);

             conf.setReducerClass(Reduce.class);

  

             conf.setInputFormat(TextInputFormat.class);

             conf.setOutputFormat(TextOutputFormat.class);

 

             FileInputFormat.setInputPaths(conf, new Path(args[0])  );

             FileOutputFormat.setOutputPath(conf, new Path(args[1]) );

             //setSearchWord(args[2]);

             JobClient.runJob(conf);

             return 0;

      }

  

      public static void main(String[] args) throws Exception

      {

             int exitCode = ToolRunner.run(new Configuration(), new WordSearch(), args);

             System.exit(exitCode);

       }

}

 

抱歉!评论已关闭.