/**
 * The mapping method. Takes an element from the input data set and transforms it into exactly
 * one element.
 *
 * @param value The input element.
 * @return The transformed element.
 * @throws Exception This method may throw exceptions. Throwing an exception will cause the
 *     operation to fail and may trigger recovery.
 */
O map(T value) throws Exception;
}
/** * The core method of the FlatMapFunction. Takes an element from the input data set and * transforms it into zero, one, or more elements. * * @param 输入元素的数据类型. * @param out 返回元素收集器 * @throws Exception This method may throw exceptions. Throwing an exception will cause the * operation to fail and may trigger recovery. */ voidflatMap(T value, Collector<O> out)throws Exception; }
/** * The filter function that evaluates the predicate. * * <p><strong>IMPORTANT:</strong> The system assumes that the function does not modify the * elements on which the predicate is applied. Violating this assumption can lead to incorrect * results. * * @param value 输入元素的数据类型 * @return 返回值如果为True该元素就保留,返回值如果为False该就过滤掉 * @throws Exception This method may throw exceptions. Throwing an exception will cause the * operation to fail and may trigger recovery. */ booleanfilter(T value)throws Exception; }
/**
 * Extracts the key from the given object.
 *
 * @param value The object to get the key from.
 * @return The extracted key.
 * @throws Exception Throwing an exception will cause the execution of the respective task to
 *     fail, and trigger recovery or cancellation of the program.
 */
KEY getKey(IN value) throws Exception;
}
/**
 * The core method of ReduceFunction: merges two values into a single value of the same type.
 * The function is applied consecutively to all values of a group until only one value is left.
 *
 * @param value1 The first value to combine.
 * @param value2 The second value to combine.
 * @return The combined value of both input values.
 * @throws Exception This method may throw exceptions. Throwing an exception will cause the
 *     operation to fail and may trigger recovery.
 */
T reduce(T value1, T value2) throws Exception;
}
/** * Partitioner that sends all elements to the downstream operator with subtask ID=0. * 分区器将所有上游元素发送到下游ID=0(第一个)的算子中 * @param <T> Type of the elements in the Stream being partitioned */ @Internal publicclassGlobalPartitioner<T> extendsStreamPartitioner<T> { privatestaticfinallongserialVersionUID=1L;
/** * Sets the partitioning of the {@link DataStream} so that the output elements are forwarded to * the local subtask of the next operation. * * @return The DataStream with forward partitioning set. */ public DataStream<T> forward() { return setConnectionType(newForwardPartitioner<T>()); }
/** * Partitioner that forwards elements only to the locally running downstream operation. * 分区器将元素发送至本地运行的下游算子,上游与下游并行度必须一致 * @param <T> Type of the elements in the Stream */ @Internal publicclassForwardPartitioner<T> extendsStreamPartitioner<T> { privatestaticfinallongserialVersionUID=1L;
@Override public String toString() { return"FORWARD"; }
@Override public SubtaskStateMapper getDownstreamSubtaskStateMapper() { return SubtaskStateMapper.UNSUPPORTED; }
@Override public SubtaskStateMapper getUpstreamSubtaskStateMapper() { return SubtaskStateMapper.UNSUPPORTED; } }
BROADCAST分区
广播分区将上游数据集输出到下游 Operator 的每个并行实例中,适合大数据集 Join 小数据集的场景。
/** * Sets the partitioning of the {@link DataStream} so that the output elements are broadcasted * to every parallel instance of the next operation. * 将上游所有元素复制多份发送至下游的所有算子中 * @return The DataStream with broadcast partitioning set. */ public DataStream<T> broadcast() { return setConnectionType(newBroadcastPartitioner<T>()); }
/** * Partitioner that selects all the output channels. * * @param <T> Type of the elements in the Stream being broadcast */ @Internal publicclassBroadcastPartitioner<T> extendsStreamPartitioner<T> { privatestaticfinallongserialVersionUID=1L;
/** * Note: Broadcast mode could be handled directly for all the output channels in record writer, * so it is no need to select channels via this method. */ @Override publicintselectChannel(SerializationDelegate<StreamRecord<T>> record) { thrownewUnsupportedOperationException( "Broadcast partitioner does not support select channels."); }
/** Returns {@link SubtaskStateMapper#UNSUPPORTED} for the upstream side. */
@Override
public SubtaskStateMapper getUpstreamSubtaskStateMapper() {
    return SubtaskStateMapper.UNSUPPORTED;
}
/** Returns {@link SubtaskStateMapper#UNSUPPORTED} for the downstream side. */
@Override
public SubtaskStateMapper getDownstreamSubtaskStateMapper() {
    return SubtaskStateMapper.UNSUPPORTED;
}
/** * Sets the partitioning of the {@link DataStream} so that the output elements are shuffled * uniformly randomly to the next operation. * 将上游算子中的所有元素随机发送至下游算子中 * @return The DataStream with shuffle partitioning set. */ @PublicEvolving public DataStream<T> shuffle() { return setConnectionType(newShufflePartitioner<T>()); }
/** * Partitioner that distributes the data equally by selecting one output channel randomly. * * @param <T> Type of the Tuple */ @Internal publicclassShufflePartitioner<T> extendsStreamPartitioner<T> { privatestaticfinallongserialVersionUID=1L;
/** * Sets the partitioning of the {@link DataStream} so that the output elements are distributed * evenly to a subset of instances of the next operation in a round-robin fashion. * * <p>The subset of downstream operations to which the upstream operation sends elements depends * on the degree of parallelism of both the upstream and downstream operation. For example, if * the upstream operation has parallelism 2 and the downstream operation has parallelism 4, then * one upstream operation would distribute elements to two downstream operations while the other * upstream operation would distribute to the other two downstream operations. If, on the other * hand, the downstream operation has parallelism 2 while the upstream operation has parallelism * 4 then two upstream operations will distribute to one downstream operation while the other * two upstream operations will distribute to the other downstream operations. * * <p>In cases where the different parallelisms are not multiples of each other one or several * downstream operations will have a differing number of inputs from upstream operations. * * @return The DataStream with rescale partitioning set. */ @PublicEvolving public DataStream<T> rescale() { return setConnectionType(newRescalePartitioner<T>()); }
/** * Partitioner that distributes the data equally by cycling through the output channels. This * distributes only to a subset of downstream nodes because {@link * org.apache.flink.streaming.api.graph.StreamingJobGraphGenerator} instantiates a {@link * DistributionPattern#POINTWISE} distribution pattern when encountering {@code RescalePartitioner}. * * <p>The subset of downstream operations to which the upstream operation sends elements depends on * the degree of parallelism of both the upstream and downstream operation. For example, if the * upstream operation has parallelism 2 and the downstream operation has parallelism 4, then one * upstream operation would distribute elements to two downstream operations while the other * upstream operation would distribute to the other two downstream operations. If, on the other * hand, the downstream operation has parallelism 2 while the upstream operation has parallelism 4 * then two upstream operations will distribute to one downstream operation while the other two * upstream operations will distribute to the other downstream operations. * * <p>In cases where the different parallelisms are not multiples of each other one or several * downstream operations will have a differing number of inputs from upstream operations. * * @param <T> Type of the elements in the Stream being rescaled */ @Internal publicclassRescalePartitioner<T> extendsStreamPartitioner<T> { privatestaticfinallongserialVersionUID=1L;
/** * Partitions a DataStream on the key returned by the selector, using a custom partitioner. This * method takes the key selector to get the key to partition on, and a partitioner that accepts * the key type. * * <p>Note: This method works only on single field keys, i.e. the selector cannot return tuples * of fields. * * @param partitioner The partitioner to assign partitions to keys. * @param keySelector The KeySelector with which the DataStream is partitioned. * @return The partitioned DataStream. * @see KeySelector */ public <K> DataStream<T> partitionCustom( Partitioner<K> partitioner, KeySelector<T, K> keySelector) { return setConnectionType( newCustomPartitionerWrapper<>(clean(partitioner), clean(keySelector))); }