 * Use {@link #restoreSequenceFile(String, JavaSparkContext)} to restore values saved with this method.
 *
 * @param path Path to save the sequence file
 * @param rdd  RDD to save
 * @see #saveSequenceFileSequences(String, JavaRDD)
 * @see #saveMapFile(String, JavaRDD)
 */
public static void saveSequenceFile(String path, JavaRDD<List<Writable>> rdd) {
    saveSequenceFile(path, rdd, null);
}
/**
 * Save a {@code JavaRDD<List<Writable>>} to a Hadoop {@link org.apache.hadoop.io.SequenceFile}. Each record is given
 * a unique (but noncontiguous) {@link LongWritable} key, and values are stored as {@link RecordWritable} instances.
 * <p>
 * Use {@link #restoreSequenceFile(String, JavaSparkContext)} to restore values saved with this method.
 *
 * @param path           Path to save the sequence file
 * @param rdd            RDD to save
 * @param maxOutputFiles Nullable. If non-null: first coalesce the RDD to the specified number of partitions,
 *                       to limit the maximum number of output sequence files
 * @see #saveSequenceFileSequences(String, JavaRDD)
 * @see #saveMapFile(String, JavaRDD)
 */
public static void saveSequenceFile(String path, JavaRDD<List<Writable>> rdd, Integer maxOutputFiles) {
    path = FilenameUtils.normalize(path, true);
    if (maxOutputFiles != null) {
        rdd = rdd.coalesce(maxOutputFiles);
    }
    //Note: Long values are unique but NOT contiguous; zipWithUniqueId is more efficient than zipWithIndex
    JavaPairRDD<List<Writable>, Long> dataIndexPairs = rdd.zipWithUniqueId();
    //Map each (record, index) pair to a writable (LongWritable, RecordWritable) pair
    JavaPairRDD<LongWritable, RecordWritable> keyedByIndex =
            dataIndexPairs.mapToPair(new RecordSavePrepPairFunction());

    keyedByIndex.saveAsNewAPIHadoopFile(path, LongWritable.class, RecordWritable.class,
            SequenceFileOutputFormat.class);
}
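// ---------------------------------------------------------------------------------------------
// Illustrative sketch (not part of the original class): limiting the number of output files.
// Passing maxOutputFiles = 4 coalesces the RDD to 4 partitions before writing, so at most 4
// sequence files are produced regardless of the input partitioning. The demo method name and
// the path are hypothetical.
// ---------------------------------------------------------------------------------------------
private static void saveWithLimitedOutputFilesExample(JavaRDD<List<Writable>> rdd) {
    saveSequenceFile("/tmp/datavec/limited", rdd, 4);
}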
/**
 * Restore a {@code JavaRDD<List<Writable>>} previously saved with {@link #saveSequenceFile(String, JavaRDD)}
 *
 * @param path Path of the sequence file
 * @param sc   Spark context
 * @return The restored RDD
 */
public static JavaRDD<List<Writable>> restoreSequenceFile(String path, JavaSparkContext sc) {
    return restoreMapFile(path, sc).values();
}
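// ---------------------------------------------------------------------------------------------
// Illustrative sketch (not part of the original class): a minimal save/restore round trip for
// single records. Assumes java.util.Arrays and the DataVec writables (org.datavec.api.writable
// IntWritable/Text) are imported; the demo method name and the path are hypothetical.
// ---------------------------------------------------------------------------------------------
private static void sequenceFileRoundTripExample(JavaSparkContext sc) {
    JavaRDD<List<Writable>> records = sc.parallelize(Arrays.asList(
            Arrays.<Writable>asList(new IntWritable(0), new Text("first")),
            Arrays.<Writable>asList(new IntWritable(1), new Text("second"))));

    saveSequenceFile("/tmp/datavec/records", records);

    //Keys are unique but noncontiguous, so the restored RDD need not preserve the original order
    JavaRDD<List<Writable>> restored = restoreSequenceFile("/tmp/datavec/records", sc);
}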
/**
 * Save a {@code JavaRDD<List<List<Writable>>>} to a Hadoop {@link org.apache.hadoop.io.SequenceFile}. Each record
 * is given a unique (but noncontiguous) {@link LongWritable} key, and values are stored as {@link SequenceRecordWritable} instances.
 * <p>
 * Use {@link #restoreSequenceFileSequences(String, JavaSparkContext)} to restore values saved with this method.
 *
 * @param path Path to save the sequence file
 * @param rdd  RDD to save
 * @see #saveSequenceFile(String, JavaRDD)
 * @see #saveMapFileSequences(String, JavaRDD)
 */
public static void saveSequenceFileSequences(String path, JavaRDD<List<List<Writable>>> rdd) {
    saveSequenceFileSequences(path, rdd, null);
}

/**
 * Save a {@code JavaRDD<List<List<Writable>>>} to a Hadoop {@link org.apache.hadoop.io.SequenceFile}. Each record
 * is given a unique (but noncontiguous) {@link LongWritable} key, and values are stored as {@link SequenceRecordWritable} instances.
 * <p>
 * Use {@link #restoreSequenceFileSequences(String, JavaSparkContext)} to restore values saved with this method.
 *
 * @param path           Path to save the sequence file
 * @param rdd            RDD to save
 * @param maxOutputFiles Nullable. If non-null: first coalesce the RDD to the specified number of partitions,
 *                       to limit the maximum number of output sequence files
 * @see #saveSequenceFile(String, JavaRDD)
 * @see #saveMapFileSequences(String, JavaRDD)
 */
public static void saveSequenceFileSequences(String path, JavaRDD<List<List<Writable>>> rdd,
                Integer maxOutputFiles) {
    path = FilenameUtils.normalize(path, true);
    if (maxOutputFiles != null) {
        rdd = rdd.coalesce(maxOutputFiles);
    }
    //Note: Long values are unique but NOT contiguous; zipWithUniqueId is more efficient than zipWithIndex
    JavaPairRDD<List<List<Writable>>, Long> dataIndexPairs = rdd.zipWithUniqueId();
    //Map each (sequence, index) pair to a writable (LongWritable, SequenceRecordWritable) pair
    JavaPairRDD<LongWritable, SequenceRecordWritable> keyedByIndex =
            dataIndexPairs.mapToPair(new SequenceRecordSavePrepPairFunction());

    keyedByIndex.saveAsNewAPIHadoopFile(path, LongWritable.class, SequenceRecordWritable.class,
            SequenceFileOutputFormat.class);
}
/**
 * Restore a {@code JavaRDD<List<List<Writable>>>} previously saved with {@link #saveSequenceFileSequences(String, JavaRDD)}
 *
 * @param path Path of the sequence file
 * @param sc   Spark context
 * @return The restored RDD
 */
public static JavaRDD<List<List<Writable>>> restoreSequenceFileSequences(String path, JavaSparkContext sc) {
    return restoreMapFileSequences(path, sc).values();
}
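// ---------------------------------------------------------------------------------------------
// Illustrative sketch (not part of the original class): saving and restoring time series
// (sequence) data. Each RDD element is one sequence: a list of time steps, each of which is a
// list of Writable values. Assumes java.util.Arrays/Collections and the DataVec writables are
// imported; the demo method name and the path are hypothetical.
// ---------------------------------------------------------------------------------------------
private static void sequenceFileSequencesRoundTripExample(JavaSparkContext sc) {
    List<List<Writable>> sequence = Arrays.asList(
            Arrays.<Writable>asList(new IntWritable(0), new DoubleWritable(1.0)),
            Arrays.<Writable>asList(new IntWritable(1), new DoubleWritable(2.0)));
    JavaRDD<List<List<Writable>>> sequences = sc.parallelize(Collections.singletonList(sequence));

    saveSequenceFileSequences("/tmp/datavec/sequences", sequences);
    JavaRDD<List<List<Writable>>> restored = restoreSequenceFileSequences("/tmp/datavec/sequences", sc);
}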
/**
 * Save a {@code JavaRDD<List<Writable>>} to a Hadoop {@link org.apache.hadoop.io.MapFile}. Each record is
 * given a unique and contiguous {@link LongWritable} key, and values are stored as
 * {@link RecordWritable} instances.
 * Note 1: If contiguous keys are not required, using a sequence file instead is preferable from a performance
 * point of view. Contiguous keys are often only required for non-Spark use cases, such as with
 * {@link org.datavec.hadoop.records.reader.mapfile.MapFileRecordReader}
 * Note 2: This uses a MapFile interval of {@link #DEFAULT_MAP_FILE_INTERVAL}, which is usually suitable for
 * use cases such as {@link org.datavec.hadoop.records.reader.mapfile.MapFileRecordReader}. Use
 * {@link #saveMapFile(String, JavaRDD, int, Integer)} or {@link #saveMapFile(String, JavaRDD, Configuration, Integer)}
 * to customize this.
 * <p>
 * Use {@link #restoreMapFile(String, JavaSparkContext)} to restore values saved with this method.
 *
 * @param path Path to save the MapFile
 * @param rdd  RDD to save
 * @see #saveMapFileSequences(String, JavaRDD)
 * @see #saveSequenceFile(String, JavaRDD)
 */
public static void saveMapFile(String path, JavaRDD<List<Writable>> rdd) {
    saveMapFile(path, rdd, DEFAULT_MAP_FILE_INTERVAL, null);
}
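// ---------------------------------------------------------------------------------------------
// Illustrative sketch (not part of the original class): writing a MapFile so the records can
// later be read locally, outside Spark, by a random-access reader such as MapFileRecordReader.
// The contiguous keys 0..n-1 are what make index-based lookup possible. The demo method name
// and the path are hypothetical.
// ---------------------------------------------------------------------------------------------
private static void mapFileExample(JavaRDD<List<Writable>> rdd) {
    //Default index interval; each record gets a contiguous LongWritable key starting at 0
    saveMapFile("/tmp/datavec/mapfile", rdd);
}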
/**
 * Save a {@code JavaRDD<List<Writable>>} to a Hadoop {@link org.apache.hadoop.io.MapFile}. Each record is
 * given a unique and contiguous {@link LongWritable} key, and values are stored as
 * {@link RecordWritable} instances.
 * Note: If contiguous keys are not required, using a sequence file instead is preferable from a performance
 * point of view. Contiguous keys are often only required for non-Spark use cases, such as with
 * {@link org.datavec.hadoop.records.reader.mapfile.MapFileRecordReader}
 * <p>
 * Use {@link #restoreMapFile(String, JavaSparkContext)} to restore values saved with this method.
 *
 * @param path           Path to save the MapFile
 * @param rdd            RDD to save
 * @param interval       The MapFile index interval to use. Smaller values may result in faster lookups, at the
 *                       expense of more memory/disk use. However, the increase is usually relatively minor,
 *                       as keys are stored as LongWritable objects
 * @param maxOutputFiles Nullable. If non-null: first coalesce the RDD to the specified number of partitions,
 *                       to limit the maximum number of output map files
 * @see #saveMapFileSequences(String, JavaRDD)
 * @see #saveSequenceFile(String, JavaRDD)
 */
public static void saveMapFile(String path, JavaRDD<List<Writable>> rdd, int interval,
                Integer maxOutputFiles) {
    Configuration c = new Configuration();
    c.set(MAP_FILE_INDEX_INTERVAL_KEY, String.valueOf(interval));
    saveMapFile(path, rdd, c, maxOutputFiles);
}
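// ---------------------------------------------------------------------------------------------
// Illustrative sketch (not part of the original class): customizing the MapFile index interval.
// An interval of 1 indexes every key (fastest lookup, largest index); larger values shrink the
// index at the cost of extra scanning per lookup. The demo method name and the path are
// hypothetical.
// ---------------------------------------------------------------------------------------------
private static void mapFileWithCustomIntervalExample(JavaRDD<List<Writable>> rdd) {
    int indexInterval = 16;        //Index every 16th key
    Integer maxOutputFiles = null; //Keep the existing partitioning
    saveMapFile("/tmp/datavec/mapfile-custom", rdd, indexInterval, maxOutputFiles);
}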
/**
 * Save a {@code JavaRDD<List<Writable>>} to a Hadoop {@link org.apache.hadoop.io.MapFile}. Each record is
 * given a unique and contiguous {@link LongWritable} key, and values are stored as
 * {@link RecordWritable} instances.
 * Note: If contiguous keys are not required, using a sequence file instead is preferable from a performance
 * point of view. Contiguous keys are often only required for non-Spark use cases, such as with
 * {@link org.datavec.hadoop.records.reader.mapfile.MapFileRecordReader}
 * <p>
 * Use {@link #restoreMapFile(String, JavaSparkContext)} to restore values saved with this method.
 *
 * @param path           Path to save the MapFile
 * @param rdd            RDD to save
 * @param c              Configuration object, used to customise options for the map file
 * @param maxOutputFiles Nullable. If non-null: first coalesce the RDD to the specified number of partitions,
 *                       to limit the maximum number of output map files
 * @see #saveMapFileSequences(String, JavaRDD)
 * @see #saveSequenceFile(String, JavaRDD)
 */
public static void saveMapFile(String path, JavaRDD<List<Writable>> rdd, Configuration c,
                Integer maxOutputFiles) {