Scala operations on HDFS

1. Listing an HDFS directory: hadoop dfs -ls path is effectively a command-line shorthand for FileSystem.listStatus.
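For example, a minimal sketch of the listStatus call behind hadoop dfs -ls (the directory path here is purely illustrative):

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

val fs = FileSystem.get(new Configuration())
// Print one line per entry, much like `hadoop dfs -ls /some/dir`.
fs.listStatus(new Path("/some/dir")).foreach(status => println(status.getPath))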

The helper below uses listStatus to locate the most recent complete checkpoint under a Spark checkpoint directory.

//The checkpoint directory is: /user/dmspark/accumulate/checkpoint
//e.g. /user/dmspark/accumulate/checkpoint/0519936a-5bff-4ecf-a6f0-3854e5952ec9/rdd-689/part-00099

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

// Shared FileSystem handle, kept at object scope so both helpers can use it.
private val fs = FileSystem.get(new Configuration())

private def getLatestCheckpoint(checkpointDir: String): Option[String] = {
  val latestCheckpointDir = findLatestSubDir(new Path(checkpointDir))
  var latestCheckpointRDDDir = findLatestSubDir(latestCheckpointDir)
  // If the latest checkpoint is incomplete, fall back to the previous
  // checkpoint written 5 minutes earlier. NumPartitionsOfReducedRDD is
  // defined elsewhere; the +1 presumably covers the extra metadata file
  // (e.g. _partitioner) that Spark writes next to the part-files.
  if (latestCheckpointRDDDir != null &&
      fs.listStatus(latestCheckpointRDDDir).length != NumPartitionsOfReducedRDD + 1)
    latestCheckpointRDDDir = findLatestSubDir(latestCheckpointDir, 1)
  if (latestCheckpointRDDDir != null) Some(latestCheckpointRDDDir.toString) else None
}

// Returns the (index+1)-th most recently modified entry under path,
// or null when path is missing or has too few entries.
def findLatestSubDir(path: Path, index: Int = 0): Path = {
  if (path == null || !fs.exists(path))
    return null
  val fileStatus = fs.listStatus(path).sortBy(_.getModificationTime).reverse
  if (fileStatus.length <= index) null else fileStatus(index).getPath
}
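A hypothetical call site, assuming the two helpers and NumPartitionsOfReducedRDD live in the same object:

getLatestCheckpoint("/user/dmspark/accumulate/checkpoint") match {
  case Some(dir) => println(s"restoring from $dir")       // a usable checkpoint exists
  case None      => println("no usable checkpoint found") // start from scratch
}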

2. Reading an HDFS file into an in-memory List

  import java.io.{BufferedReader, InputStreamReader}
  import java.net.URI
  import org.apache.hadoop.conf.Configuration
  import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path}
  import scala.collection.mutable.ListBuffer

  def readToList(file: Path): List[String] = {
    val uRI = "hdfs://localhost:8021"
    val configuration = new Configuration()
    val hdfs: FileSystem = FileSystem.get(URI.create(uRI), configuration)
    val in: FSDataInputStream = hdfs.open(file)
    val reader: BufferedReader = new BufferedReader(new InputStreamReader(in, "UTF-8"))
    val list = new ListBuffer[String]

    // In Scala an assignment evaluates to Unit, so the Java idiom
    // `(line = reader.readLine) != null` is always true; read and test
    // the value explicitly instead, and close the reader when done.
    try {
      var line = reader.readLine()
      while (line != null) {
        list += line
        line = reader.readLine()
      }
    } finally {
      reader.close()
    }
    list.toList
  }
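As a side note, scala.io.Source can replace the manual BufferedReader loop entirely, since getLines stops at end-of-file on its own. A minimal sketch under the same NameNode URI (readToListViaSource is a hypothetical name, not part of the code above):

  import java.net.URI
  import org.apache.hadoop.conf.Configuration
  import org.apache.hadoop.fs.{FileSystem, Path}
  import scala.io.Source

  def readToListViaSource(file: Path): List[String] = {
    val hdfs = FileSystem.get(URI.create("hdfs://localhost:8021"), new Configuration())
    val in = hdfs.open(file)
    // toList materializes all lines before the stream is closed.
    try Source.fromInputStream(in, "UTF-8").getLines().toList
    finally in.close()
  }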
 

  def main(args: Array[String]): Unit = {
    val path = new Path("/user/dmspark/product3_source_conf/catecode_info.txt")
    println(readToList(path).size)
  }

 
