Skip to content

Commit

Permalink
docs(samples): Add Dataflow "Getting started" flex template (#8897)
Browse files Browse the repository at this point in the history
* Add a basic flex template sample, for the Dataflow flex templates tutorial.
* Use unique image tag in test
  • Loading branch information
VeronicaWasson authored Dec 7, 2023
1 parent 27ee41e commit dc18ac3
Show file tree
Hide file tree
Showing 5 changed files with 512 additions and 0 deletions.
70 changes: 70 additions & 0 deletions dataflow/flex-templates/getting_started/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# Dataflow flex template: Getting started sample

## Before you begin

Make sure you have followed the
[Dataflow setup instructions](../../README.md).

## Create a Cloud Storage bucket

```sh
export BUCKET="your-bucket"
gcloud storage buckets create gs://$BUCKET
```

## Create an Artifact Registry repository

```sh
export REGION="us-central1"
export REPOSITORY="your-repository"

gcloud artifacts repositories create $REPOSITORY \
--repository-format=docker \
--location=$REGION
```

## Build the JAR file

```sh
mvn clean package
```

## Build the template

```sh
export PROJECT="project-id"

gcloud dataflow flex-template build gs://$BUCKET/getting_started_java.json \
--image-gcr-path "$REGION-docker.pkg.dev/$PROJECT/$REPOSITORY/getting-started-java:latest" \
--sdk-language "JAVA" \
--flex-template-base-image JAVA11 \
--metadata-file "metadata.json" \
--jar "target/flex-template-getting-started-1.0.jar" \
--env FLEX_TEMPLATE_JAVA_MAIN_CLASS="com.example.dataflow.FlexTemplateGettingStarted"
```

## Run the template

```sh

gcloud dataflow flex-template run "flex-`date +%Y%m%d-%H%M%S`" \
--template-file-gcs-location "gs://$BUCKET/getting_started_java.json" \
--region $REGION \
--parameters output="gs://$BUCKET/output-"
```

## Clean up

To delete the resources that you created:

```sh
gcloud artifacts repositories delete $REPOSITORY --location $REGION --quiet
gcloud storage rm gs://$BUCKET --recursive
```


## What's next?

For more information about building and running flex templates, see
📝 [Use Flex Templates](https://cloud.google.com/dataflow/docs/guides/templates/using-flex-templates).

14 changes: 14 additions & 0 deletions dataflow/flex-templates/getting_started/metadata.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"name": "Getting started Java flex template",
"description": "An example flex template for Java.",
"parameters": [
{
"name": "output",
"label": "Output destination",
"helpText": "The path and filename prefix for writing output files.",
"regexes": [
"^gs:\\/\\/[^\\n\\r]+$"
]
}
]
}
204 changes: 204 additions & 0 deletions dataflow/flex-templates/getting_started/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<parent>
<groupId>com.google.cloud.samples</groupId>
<artifactId>shared-configuration</artifactId>
<version>1.2.0</version>
</parent>

<groupId>com.example.dataflow</groupId>
<artifactId>flex-template-getting-started</artifactId>
<version>1.0</version>

<properties>
<maven.compiler.source>11</maven.compiler.source>
<maven.compiler.target>11</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<beam.version>2.49.0</beam.version>

<maven-enforcer-plugin.version>3.4.1</maven-enforcer-plugin.version>
<maven-compiler-plugin.version>3.11.0</maven-compiler-plugin.version>
<maven-shade-plugin.version>3.5.0</maven-shade-plugin.version>
<maven-exec-plugin.version>3.1.0</maven-exec-plugin.version>
<slf4j.version>2.0.8</slf4j.version>
</properties>

<repositories>
<repository>
<id>apache.snapshots</id>
<name>Apache Development Snapshot Repository</name>
<url>https://repository.apache.org/content/repositories/snapshots/</url>
<releases>
<enabled>false</enabled>
</releases>
</repository>
</repositories>

<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-enforcer-plugin</artifactId>
<version>${maven-enforcer-plugin.version}</version>
<executions>
<execution>
<id>enforce-maven</id>
<goals>
<goal>enforce</goal>
</goals>
<configuration>
<rules>
<requireMavenVersion>
<version>3.0.5</version>
</requireMavenVersion>
</rules>
</configuration>
</execution>
</executions>
</plugin>

<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>${maven-compiler-plugin.version}</version>
</plugin>

<!-- The maven shade plugin is used to create an uber-jar with all the
dependencies needed to run as a standalone jar.
Do not minimize the jar since that removes some of the required
classes for the runners. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>${maven-shade-plugin.version}</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<transformers>
<transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
</transformers>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
</configuration>
</execution>
</executions>
</plugin>
</plugins>

<pluginManagement>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
<version>${maven-exec-plugin.version}</version>
<configuration>
<cleanupDaemonThreads>false</cleanupDaemonThreads>
</configuration>
</plugin>
</plugins>
</pluginManagement>
</build>

<dependencies>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>${slf4j.version}</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-jdk14</artifactId>
<version>${slf4j.version}</version>
<scope>runtime</scope>
</dependency>

<!-- Apache Beam
To run on another of the Beam runners, add its module to this pom.xml
according to the runner-specific setup instructions on the Beam website:
http://beam.apache.org/documentation/#runners
-->
<dependency>
<groupId>org.apache.beam</groupId>
<artifactId>beam-sdks-java-core</artifactId>
<version>${beam.version}</version>
</dependency>

<!-- Direct Runner -->
<dependency>
<groupId>org.apache.beam</groupId>
<artifactId>beam-runners-direct-java</artifactId>
<version>${beam.version}</version>
<scope>runtime</scope>
</dependency>

<!-- Dataflow Runner -->
<dependency>
<groupId>org.apache.beam</groupId>
<artifactId>beam-runners-google-cloud-dataflow-java</artifactId>
<version>${beam.version}</version>
<scope>runtime</scope>
</dependency>

<!-- Google Cloud I/O -->
<dependency>
<groupId>org.apache.beam</groupId>
<artifactId>beam-sdks-java-io-google-cloud-platform</artifactId>
<version>${beam.version}</version>
</dependency>
<dependency>
<groupId>com.google.cloud</groupId>
<artifactId>google-cloud-dataflow</artifactId>
<version>0.34.0</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>com.google.cloud</groupId>
<artifactId>google-cloud-artifact-registry</artifactId>
<version>1.29.0</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>com.google.cloud</groupId>
<artifactId>google-cloud-storage</artifactId>
<version>2.29.1</version>
</dependency>

<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.13.2</version>
<scope>test</scope>
</dependency>
</dependencies>
</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package com.example.dataflow;

import java.util.Arrays;
import java.util.List;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.options.Validation;
import org.apache.beam.sdk.transforms.Create;

/**
* An Apache Beam batch pipeline that writes data to Cloud Storage.
*/
public class FlexTemplateGettingStarted {

public interface Options extends PipelineOptions {
@Description("The Cloud Storage bucket to write to")
@Validation.Required
String getOutput();

void setOutput(String value);
}

// Write text data to Cloud Storage.
public static void main(String[] args) {
final List<String> wordsList = Arrays.asList("1", "2", "3", "4");

var options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
var pipeline = Pipeline.create(options);
pipeline
.apply(Create.of(wordsList))
.apply(TextIO
.write()
.to(options.getOutput())
.withSuffix(".txt")
);

// For a Dataflow Flex Template, do NOT call waitUntilFinish().
pipeline.run();
}
}
Loading

0 comments on commit dc18ac3

Please # to comment.