1、讀取hdfs目錄:hadoop dfs -ls path相當於listStatus的簡寫
//checkpoint目錄是:/user/dmspark/accumulate/checkpoint
//e.g. /user/dmspark/accumulate/checkpoint/0519936a-5bff-4ecf-a6f0-3854e5952ec9/rdd-689/part-00099
/** Resolves the newest complete checkpoint RDD directory under `checkpointDir`.
  *
  * Layout (see comment above): checkpointDir/<app-uuid>/rdd-NNN/part-XXXXX.
  * A checkpoint is considered complete when its rdd directory holds exactly
  * NumPartitionsOfReducedRDD part files plus one metadata entry.
  *
  * @param checkpointDir root checkpoint directory on HDFS
  * @return Some(path) of the newest complete rdd directory, or the previous
  *         one (5 minutes earlier) when the newest is incomplete; None when
  *         no usable checkpoint exists.
  */
private def getLatestCheckpoint(checkpointDir: String): Option[String] = {
  val fs = FileSystem.get(new Configuration())
  val latestCheckpointDir = findLatestSubDir(new Path(checkpointDir))
  val newest = findLatestSubDir(latestCheckpointDir)
  // Original code called fs.listStatus(newest) unconditionally, which throws
  // on a null path when the checkpoint tree is empty — guard first.
  val chosen =
    if (newest != null && fs.listStatus(newest).length != NumPartitionsOfReducedRDD + 1)
      // Newest checkpoint is incomplete: fall back to the previous (5-minutes-older) one.
      findLatestSubDir(latestCheckpointDir, 1)
    else
      newest
  // Option(null) => None, so missing checkpoints map cleanly to None.
  Option(chosen).map(_.toString)
}
/** Returns the `index`-th newest child of `path` by modification time,
  * or null when `path` is null/absent or has fewer than `index + 1` children.
  * Uses the enclosing class's `fs` handle. index = 0 means the newest child.
  */
def findLatestSubDir(path: Path, index: Int = 0): Path =
  if (path == null || !fs.exists(path)) null
  else {
    // Stable ascending sort then reverse, so the most recently modified entry is first.
    val newestFirst = fs.listStatus(path).sortBy(_.getModificationTime).reverse
    if (index < newestFirst.length) newestFirst(index).getPath else null
  }
2、讀取hdfs文件到內存的list
/** Reads an HDFS text file fully into memory as a list of lines.
  *
  * Fixes over the original: the `while ((line = reader.readLine) != null)`
  * condition compared a Unit assignment result with null — always true in
  * Scala — so the loop only terminated via the redundant inner break; and the
  * reader/stream were never closed (resource leak on every call).
  *
  * @param file HDFS path of the file to read
  * @return all lines of the file, in order
  */
def readToList(file: Path): List[String] = {
  val uRI = "hdfs://localhost:8021"
  val configuration = new Configuration()
  val hdfs: FileSystem = FileSystem.get(URI.create(uRI), configuration)
  val in: FSDataInputStream = hdfs.open(file)
  val reader: BufferedReader = new BufferedReader(new InputStreamReader(in, "UTF8"))
  try {
    // readLine() returns null at EOF; takeWhile stops there.
    Iterator.continually(reader.readLine()).takeWhile(_ != null).toList
  } finally {
    // Closing the reader also closes the wrapped HDFS input stream.
    reader.close()
  }
}
/** Smoke test: prints how many lines the category-code config file has on HDFS. */
def main(args: Array[String]): Unit = {
  val cateCodeFile = new Path("/user/dmspark/product3_source_conf/catecode_info.txt")
  val lines = readToList(cateCodeFile)
  println(lines.size)
}