Skip to content

Commit 636532e

Browse files
author
Wang, Gang(Gary)
committed
MNEMONIC-258: Implement the functionalities of DurableRDD's direct IO
1 parent a7b9458 commit 636532e

File tree

3 files changed

+102
-44
lines changed

3 files changed

+102
-44
lines changed
Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717

1818
package org.apache.mnemonic.spark
1919

20-
import org.apache.spark.rdd.RDD
2120
import org.apache.spark._
2221
import scala.reflect.ClassTag
2322
import scala.language.implicitConversions
@@ -27,12 +26,12 @@ import org.apache.mnemonic.spark.rdd.DurableRDD
2726

2827
class DurableSparkFunctions(sc: SparkContext) extends Serializable {
2928

30-
def mnemonic[D: ClassTag] (pathname: String,
29+
def mnemonic[D: ClassTag] (path: String,
3130
serviceName: String,
3231
durableTypes: Array[DurableType],
3332
entityFactoryProxies: Array[EntityFactoryProxy],
3433
slotKeyId: Long) = {
35-
DurableRDD[D](sc, pathname: String,
34+
DurableRDD[D](sc, path: String,
3635
serviceName, durableTypes, entityFactoryProxies, slotKeyId)
3736
}
3837
}

mnemonic-spark/mnemonic-spark-core/src/main/scala/org/apache/mnemonic/spark/rdd/DurableRDD.scala

Lines changed: 89 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -22,17 +22,18 @@ import java.io.File
2222
import org.apache.spark.rdd.RDD
2323
import org.apache.spark._
2424
import org.apache.commons.io.FileUtils
25-
import scala.reflect.{ ClassTag }
25+
26+
import scala.reflect.ClassTag
2627
import scala.collection.JavaConverters._
2728
import scala.collection.mutable.ArrayBuffer
28-
import org.apache.mnemonic.DurableType
29-
import org.apache.mnemonic.EntityFactoryProxy
30-
import org.apache.mnemonic.NonVolatileMemAllocator
29+
import org.apache.mnemonic.{ConfigurationException, DurableType, EntityFactoryProxy, NonVolatileMemAllocator}
3130
import org.apache.mnemonic.sessions.ObjectCreator
3231
import org.apache.mnemonic.spark.MneDurableInputSession
3332
import org.apache.mnemonic.spark.MneDurableOutputSession
3433
import org.apache.mnemonic.spark.DurableException
3534

35+
import scala.collection.mutable
36+
3637
private[spark] class DurableRDD[D: ClassTag, T: ClassTag] (
3738
@transient private var _sc: SparkContext,
3839
@transient private var deps: Seq[Dependency[_]],
@@ -45,40 +46,47 @@ private[spark] class DurableRDD[D: ClassTag, T: ClassTag] (
4546

4647
private val isInputOnly = null == deps
4748

48-
private val durdddir = DurableRDD.getRddDirName(durableDirectory, id)
49-
DurableRDD.resetRddDir(durdddir)
50-
51-
override val partitioner = if (preservesPartitioning) firstParent[T].partitioner else None
49+
// Directory that holds this RDD's memory pool files.
// - Input-only RDD (no dependencies): the caller-supplied directory is used as-is and
//   must already exist, otherwise a ConfigurationException is thrown.
// - Transforming RDD: a per-RDD sub-directory is derived from the RDD id and reset
//   via DurableRDD.resetRddDir before use.
// Declared as an immutable val; it is computed exactly once during construction.
private val durdddir: String =
  if (isInputOnly) {
    if (!new File(durableDirectory).exists) {
      throw new ConfigurationException("Input directory does not exist")
    }
    durableDirectory
  } else {
    val rddDirName = DurableRDD.getRddDirName(durableDirectory, id)
    DurableRDD.resetRddDir(rddDirName)
    rddDirName
  }
5260

53-
override protected def getPartitions: Array[Partition] = firstParent[T].partitions
61+
override val partitioner = if (!isInputOnly && preservesPartitioning) firstParent[T].partitioner else None
5462

55-
def prepareDurablePartition(split: Partition, context: TaskContext,
56-
iterator: Iterator[T]): Array[File] = {
57-
val outsess = MneDurableOutputSession[D](serviceName,
58-
durableTypes, entityFactoryProxies, slotKeyId,
59-
partitionPoolSize, durdddir.toString,
60-
DurableRDD.genDurableFileName(split.hashCode)_)
61-
try {
62-
for (item <- iterator) {
63-
f(item, outsess) match {
64-
case Some(res) => outsess.post(res)
65-
case None =>
66-
}
63+
override protected def getPartitions: Array[Partition] = {
64+
if (isInputOnly) {
65+
val ret = DurableRDD.collectMemPoolPartitionList(durdddir).getOrElse(Array[Partition]())
66+
if (ret.isEmpty) {
67+
logInfo(s"Not found any partitions in the directory ${durdddir}")
6768
}
68-
} finally {
69-
outsess.close()
69+
ret
70+
} else {
71+
firstParent[T].partitions
7072
}
71-
outsess.memPools.toArray
7273
}
7374

7475
override def compute(split: Partition, context: TaskContext): Iterator[D] = {
7576
val mempListOpt: Option[Array[File]] =
76-
DurableRDD.collectMemPoolFileList(durdddir.toString, DurableRDD.genDurableFileName(split.hashCode)_)
77+
DurableRDD.collectMemPoolFileList(durdddir, DurableRDD.genDurableFileName(context.partitionId)_)
7778
val memplist = mempListOpt match {
7879
case None => {
79-
val mplst = prepareDurablePartition(split, context, firstParent[T].iterator(split, context))
80-
logInfo(s"Done transformed RDD #${firstParent[T].id} to durableRDD #${id} on ${durdddir.toString}")
81-
mplst
80+
if (isInputOnly) {
81+
logInfo(s"Not found any mem pool files related to the partition #${context.partitionId}")
82+
Array[File]()
83+
} else {
84+
val mplst = DurableRDD.prepareDurablePartition[D, T](durdddir,
85+
serviceName, durableTypes, entityFactoryProxies, slotKeyId,
86+
partitionPoolSize, f)(context, firstParent[T].iterator(split, context))
87+
logInfo(s"Done transformed RDD #${firstParent[T].id} to durableRDD #${id} on ${durdddir}")
88+
mplst
89+
}
8290
}
8391
case Some(mplst) => mplst
8492
}
@@ -104,6 +112,7 @@ object DurableRDD {
104112

105113
val durableSubDirNameTemplate = "durable-rdd-%010d"
106114
val durableFileNameTemplate = "mem_%010d_%010d.mne"
115+
// Matches the name of a partition's pool file with pool index 0, following
// durableFileNameTemplate; the captured group is the zero-padded partition id.
// The dot is escaped so that only a literal ".mne" extension matches.
val durableFileNamePartitionRegex = raw"mem_(\d{10})_0000000000\.mne".r
107116

108117
private var durableDir: Option[String] = None
109118

@@ -146,23 +155,40 @@ object DurableRDD {
146155
}
147156

148157
def createRddDir(rddDirName: String) {
149-
val durdddir = new File(rddDirName)
150-
if (!durdddir.mkdir) {
151-
throw new DurableException(s"Durable RDD directory ${durdddir.toString} cannot be created")
158+
val dir = new File(rddDirName)
159+
if (!dir.mkdir) {
160+
throw new DurableException(s"Durable RDD directory ${dir.toString} cannot be created")
152161
}
153162
}
154163

155164
def deleteRddDir(rddDirName: String) {
156-
val durdddir = new File(rddDirName)
157-
if (durdddir.exists) {
158-
FileUtils.deleteDirectory(durdddir)
165+
val dir = new File(rddDirName)
166+
if (dir.exists) {
167+
FileUtils.deleteDirectory(dir)
159168
}
160169
}
161170

162-
def genDurableFileName(splitId: Int)(mempidx: Long): String = {
171+
def genDurableFileName(splitId: Long)(mempidx: Long): String = {
163172
durableFileNameTemplate.format(splitId, mempidx)
164173
}
165174

175+
/**
 * Discover the partitions persisted in a durable directory.
 *
 * Every regular file directly under `path` whose name matches
 * `durableFileNamePartitionRegex` contributes the partition id encoded in its name.
 * Ids are de-duplicated and returned in ascending order.
 *
 * @param path directory to scan for memory pool files
 * @return always a `Some`; the array is empty when `path` is not a readable
 *         directory or contains no matching pool files
 */
def collectMemPoolPartitionList(path: String): Option[Array[Partition]] = {
  // listFiles yields null when the path is not a directory or cannot be read.
  val entries = Option(new File(path).listFiles).getOrElse(Array.empty[File])
  val partitionIds = entries
    .filter(_.isFile)   // pool files are regular files, not sub-directories
    .map(_.getName)     // the regex describes the bare file name, not the full path
    .collect { case durableFileNamePartitionRegex(paridx) => paridx }
    // ten digits may exceed Int range; such names cannot denote a partition, skip them
    .flatMap(digits => scala.util.Try(digits.toInt).toOption)
    .distinct
    .sorted
  Option(partitionIds.map(x => new Partition { val index = x }))
}
191+
166192
def collectMemPoolFileList(durddir: String, memFileNameGen: (Long)=>String): Option[Array[File]] = {
167193
val flist: ArrayBuffer[File] = new ArrayBuffer[File]
168194
var idx: Long = 0L
@@ -184,28 +210,52 @@ object DurableRDD {
184210
}
185211
}
186212

213+
/**
 * Write the items of one partition through a durable output session.
 *
 * Each item is passed to `func` together with the session; when `func` yields a
 * value it is posted to the session. The session is closed even if iteration fails.
 *
 * @param path     directory handed to the output session
 * @param func     transformation producing an optional durable object per item
 * @param context  task context; its partition id is used to generate pool file names
 * @param iterator items of the partition being written
 * @return the memory pool files reported by the session
 */
def prepareDurablePartition[D: ClassTag, T: ClassTag] (path: String,
    serviceName: String, durableTypes: Array[DurableType],
    entityFactoryProxies: Array[EntityFactoryProxy], slotKeyId: Long,
    partitionPoolSize: Long,
    func: (T, ObjectCreator[D, NonVolatileMemAllocator]) => Option[D]
    ) (context: TaskContext, iterator: Iterator[T]): Array[File] = {
  val session = MneDurableOutputSession[D](serviceName,
    durableTypes, entityFactoryProxies, slotKeyId,
    partitionPoolSize, path,
    genDurableFileName(context.partitionId)_)
  try {
    iterator.foreach { element =>
      func(element, session) match {
        case Some(durable) => session.post(durable)
        case None =>
      }
    }
  } finally {
    session.close()
  }
  session.memPools.toArray
}
235+
187236
def apply[D: ClassTag, T: ClassTag] (
188237
rdd: RDD[T],
189238
serviceName: String, durableTypes: Array[DurableType],
190239
entityFactoryProxies: Array[EntityFactoryProxy], slotKeyId: Long,
191240
partitionPoolSize: Long,
192241
f: (T, ObjectCreator[D, NonVolatileMemAllocator]) => Option[D],
193242
preservesPartitioning: Boolean = false) = {
194-
// val sc: SparkContext = rdd.context
243+
val sc: SparkContext = rdd.context
244+
val cleanF = f // sc.clean(f)
195245
val ret = new DurableRDD[D, T](rdd.context , List(new OneToOneDependency(rdd)),
196246
serviceName, durableTypes, entityFactoryProxies, slotKeyId,
197-
partitionPoolSize, getDurableDir(rdd.context).get, f, preservesPartitioning)
247+
partitionPoolSize, getDurableDir(sc).get, cleanF, preservesPartitioning)
198248
//sc.cleaner.foreach(_.registerRDDForCleanup(ret))
199249
ret
200250
}
201251

202252
def apply[D: ClassTag] (
203-
sc: SparkContext, pathname: String,
253+
sc: SparkContext, path: String,
204254
serviceName: String, durableTypes: Array[DurableType],
205255
entityFactoryProxies: Array[EntityFactoryProxy], slotKeyId: Long) = {
206256
val ret = new DurableRDD[D, Unit](sc, null,
207257
serviceName, durableTypes, entityFactoryProxies, slotKeyId,
208-
1024*1024*1024L, pathname, null)
258+
1024*1024*1024L, path, null)
209259
ret
210260
}
211261

mnemonic-spark/mnemonic-spark-core/src/main/scala/org/apache/mnemonic/spark/rdd/DurableRDDFunctions.scala

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
package org.apache.mnemonic.spark.rdd
1919

20+
import java.io.File
2021
import org.apache.spark.rdd.RDD
2122
import scala.reflect.ClassTag
2223
import scala.language.implicitConversions
@@ -40,14 +41,22 @@ class DurableRDDFunctions[T: ClassTag](rdd: RDD[T]) extends Serializable {
4041
partitionPoolSize, f, preservesPartitioning)
4142
}
4243

43-
def saveAsMnemonic[D: ClassTag] (dir: String,
44+
/**
 * Export the wrapped RDD into memory pool files under `path`.
 *
 * The directory, including any missing parent directories, is created on the
 * machine that calls this method. A job is then run over the RDD that writes each
 * partition via `DurableRDD.prepareDurablePartition`.
 *
 * @param path              output directory for the pool files
 * @param partitionPoolSize pool size passed on to the output session
 * @param f                 transformation producing an optional durable object per item
 * @throws java.io.IOException if the output directory does not exist and cannot be created
 */
def saveAsMnemonic[D: ClassTag] (path: String,
    serviceName: String,
    durableTypes: Array[DurableType],
    entityFactoryProxies: Array[EntityFactoryProxy],
    slotKeyId: Long,
    partitionPoolSize: Long,
    f: (T, ObjectCreator[D, NonVolatileMemAllocator]) => Option[D]): Unit = {
  val dir = new File(path)
  // mkdirs also creates missing parents; the trailing isDirectory re-check tolerates
  // a concurrent creator winning the race between the first check and mkdirs.
  if (!dir.isDirectory && !dir.mkdirs() && !dir.isDirectory) {
    throw new java.io.IOException(s"Durable output directory ${path} cannot be created")
  }
  val cleanF = f // rdd.context.clean(f)
  val func = DurableRDD.prepareDurablePartition[D, T] (path,
    serviceName, durableTypes, entityFactoryProxies, slotKeyId,
    partitionPoolSize, cleanF)_
  rdd.context.runJob(rdd, func)
}
5261
}
5362

0 commit comments

Comments
 (0)