现在的位置: 首页 > 综合 > 正文

hadoop中文件的通配globStatus用法

2014年10月06日 ⁄ 综合 ⁄ 共 3112字 ⁄ 字号 评论关闭
 /**
   * 列出给定的Path(含通配符)对应的文件/目录,根据第二个给定的过滤器进行过滤
   * <p>Return all the files that match filePattern and are not checksum
   * files. Results are sorted by their names.
   * 
   * <p>
   * A filename pattern is composed of <i>regular</i> characters and
   * <i>special pattern matching</i> characters, which are:
   *
   * <dl>
   *  <dd>
   *   <dl>
   *    <p>
   *    <dt> <tt> ? </tt>
   *    <dd> Matches any single character.
   *
   *    <p>
   *    <dt> <tt> * </tt>
   *    <dd> Matches zero or more characters.
   *
   *    <p>
   *    <dt> <tt> [<i>abc</i>] </tt>
   *    <dd> Matches a single character from character set
   *     <tt>{<i>a,b,c</i>}</tt>.
   *
   *    <p>
   *    <dt> <tt> [<i>a</i>-<i>b</i>] </tt>
   *    <dd> Matches a single character from the character range
   *     <tt>{<i>a...b</i>}</tt>.  Note that character <tt><i>a</i></tt> must be
   *     lexicographically less than or equal to character <tt><i>b</i></tt>.
   *
   *    <p>
   *    <dt> <tt> [^<i>a</i>] </tt>
   *    <dd> Matches a single character that is not from character set or range
   *     <tt>{<i>a</i>}</tt>.  Note that the <tt>^</tt> character must occur
   *     immediately to the right of the opening bracket.
   *
   *    <p>
   *    <dt> <tt> \<i>c</i> </tt>
   *    <dd> Removes (escapes) any special meaning of character <i>c</i>.
   *
   *    <p>
   *    <dt> <tt> {ab,cd} </tt>
   *    <dd> Matches a string from the string set <tt>{<i>ab, cd</i>} </tt>
   *    
   *    <p>
   *    <dt> <tt> {ab,c{de,fh}} </tt>
   *    <dd> Matches a string from the string set <tt>{<i>ab, cde, cfh</i>}</tt>
   *
   *   </dl>
   *  </dd>
   * </dl>
   * 
   * Return an array of FileStatus objects whose path names match pathPattern
   * and are accepted by the user-supplied path filter. Results are sorted by
   * their path names.
   * Return null if pathPattern has no glob and the path does not exist.
   * Return an empty array if pathPattern has a glob and no path matches it. 
   * 
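   * @param pathPattern
   *          a glob pattern (using the special pattern matching characters
   *          described above, not Java regular-expression syntax) specifying
   *          the paths to match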
   * @param filter
   *          a user-supplied path filter
   * @return an array of FileStatus objects
   * @throws IOException if any I/O error occurs when fetching file status
   */
import java.net.URI;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
class RegexExcludePathFilter implements PathFilter{
	private final String regex;
	public RegexExcludePathFilter(String regex)
	{
		this.regex=regex;
	}
	public boolean accept(Path path)
	{
		return !path.toString().matches(regex);
	}
}

/**
 * Command-line demo of two Hadoop {@link FileSystem} listing calls: it first
 * lists the contents of the paths given as arguments with {@code listStatus},
 * then prints the result of a {@code globStatus} call on a fixed glob pattern
 * combined with a {@link RegexExcludePathFilter}.
 */
public class ListStatus {
	/**
	 * Lists the given paths, prints a separator line, then prints the paths
	 * selected by the fixed glob pattern and exclusion filter.
	 *
	 * @param args one or more paths to list; the first one is also used as the
	 *             URI from which the {@link FileSystem} instance is obtained
	 * @throws Exception if obtaining the file system or listing paths fails
	 */
	public static void main(String[] args)throws Exception{
		// Without at least one argument args[0] below would throw
		// ArrayIndexOutOfBoundsException; report the expected usage instead.
		if(args==null||args.length==0)
		{
			System.err.println("Usage: ListStatus <path> [<path> ...]");
			System.exit(1);
			return;
		}
		Configuration conf=new Configuration();
		String uri=args[0];
		FileSystem fs =FileSystem.get(URI.create(uri),conf);// obtain a file system instance for the first argument's URI
		Path[] paths=new Path[args.length];
		for(int i=0;i<paths.length;++i)
		{
			paths[i]=new Path(args[i]);
		}
		// List the contents of every requested path in one call.
		printPaths(fs.listStatus(paths));
		System.out.println("====================================================");
		// Filtering: globStatus first expands the glob in its first argument,
		// then keeps only the paths accepted by the filter. Because
		// RegexExcludePathFilter.accept returns the negated regex match, paths
		// matching the regex are removed from the result.
		FileStatus[]stt=fs.globStatus(new Path("hdfs://localhost:9000/user/*/input/*"),new RegexExcludePathFilter("hdfs://localhost:9000/user/root/input/file[023]{2}"));
		printPaths(stt);
	}

	/**
	 * Converts the statuses to paths and prints one path per line on standard output.
	 *
	 * @param statuses statuses to print; a null array prints nothing (the
	 *                 globStatus documentation says it may return null when the
	 *                 pattern has no glob and the path does not exist)
	 */
	private static void printPaths(FileStatus[] statuses)
	{
		if(statuses==null)
		{
			return;
		}
		// Convert the statuses to paths so the results print as plain paths.
		Path[] listedPaths=FileUtil.stat2Paths(statuses);
		for(Path p:listedPaths)
		{
			System.out.println(p);
		}
	}
}

抱歉!评论已关闭.