diff --git a/.github/workflows/batch-processing.yml b/.github/workflows/batch-processing.yml new file mode 100644 index 000000000..64508d1e9 --- /dev/null +++ b/.github/workflows/batch-processing.yml @@ -0,0 +1,17 @@ +name: batch-processing tests +on: [push] +defaults: + run: + working-directory: batch-processing +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Set up Go + uses: actions/setup-go@v3 + with: + go-version-file: "batch-processing/go.mod" + cache: false + - name: Test + run: make test diff --git a/batch-processing/.bash_history b/batch-processing/.bash_history new file mode 100644 index 000000000..5e2e79264 --- /dev/null +++ b/batch-processing/.bash_history @@ -0,0 +1 @@ +go run . --input /inputs/example.csv --output /outputs/example-result.csv diff --git a/batch-processing/.gitignore b/batch-processing/.gitignore new file mode 100644 index 000000000..bcc41141f --- /dev/null +++ b/batch-processing/.gitignore @@ -0,0 +1 @@ +docker_env \ No newline at end of file diff --git a/batch-processing/Dockerfile b/batch-processing/Dockerfile new file mode 100644 index 000000000..f73c14816 --- /dev/null +++ b/batch-processing/Dockerfile @@ -0,0 +1,76 @@ +# syntax=docker/dockerfile:1 +## +## BASE +## +FROM golang:1.19-bullseye as base + +# Ignore APT warnings about not having a TTY +ENV DEBIAN_FRONTEND noninteractive + +# install build essentials +RUN apt-get update && \ + apt-get install -y wget build-essential pkg-config --no-install-recommends + +# Install ImageMagick deps +RUN apt-get -q -y install libjpeg-dev libpng-dev libtiff-dev \ + libgif-dev libx11-dev --no-install-recommends + +ENV IMAGEMAGICK_VERSION=6.9.10-11 + +# Install ImageMagick +RUN cd && \ + wget https://github.com/ImageMagick/ImageMagick6/archive/${IMAGEMAGICK_VERSION}.tar.gz && \ + tar xvzf ${IMAGEMAGICK_VERSION}.tar.gz && \ + cd ImageMagick* && \ + ./configure \ + --without-magick-plus-plus \ + --without-perl \ + --disable-openmp \ + --with-gvc=no \ + --disable-docs && \ + make -j$(nproc) && make install && \ + ldconfig /usr/local/lib + +# Build the app +WORKDIR /app + +COPY go.mod ./ +COPY go.sum ./ + +RUN go mod download + +COPY *.go ./ +RUN mkdir -p /inputs /outputs + +# This is required for test and run, but for develop it ensures we have a build cache +RUN go build -o /out + +# Set up environment +ENV AWS_REGION "" +ENV AWS_ROLE_ARN "" +ENV S3_BUCKET "" + +## +## TEST +## +FROM base as test + +ENTRYPOINT [ "go", "test", "-v" ] + +## +## DEVELOP +## +FROM base as develop + +RUN mkdir -p /root/.aws /root/.cache/go-build +COPY .bash_history /root/.bash_history +ENTRYPOINT [ "/bin/bash" ] + +## +## RUN +## +FROM base as run + +WORKDIR / +ENTRYPOINT ["/out"] +CMD ["--input", "/inputs/example.csv", "--output", "/outputs/example-result.csv"] \ No newline at end of file diff --git a/batch-processing/IMPLEMENTATION.md b/batch-processing/IMPLEMENTATION.md new file mode 100644 index 000000000..b0c843fc0 --- /dev/null +++ b/batch-processing/IMPLEMENTATION.md @@ -0,0 +1,176 @@ +# batch-processing + +See https://github.com/CodeYourFuture/immersive-go-course/issues/26 for context. + +## Plan + +The planned architecture of this: + +1. Read the CSV +2. Download the images to a location (`/tmp`) +3. Use imagemagick to monochrome them +4. Upload them to S3 +5. Return the URL + +I tried [an initial implementation](https://github.com/CodeYourFuture/immersive-go-course/pull/46) of this that went a long way, but that I didn't like in the end. + +The first step will be to build this linearly, and to write tests as we go. Because there is real file getting and writing, we will run integration tests in Docker: + +1. Mock the `jpg` get +2. Write a real file +3. Mock S3 methods using [s3iface](https://docs.aws.amazon.com/sdk-for-go/api/service/s3/s3iface/) + +Then use goroutines to run it in parallel, likely by wrapping the output in a mutex and locking/unlocking as the goroutine completes: https://pkg.go.dev/sync#Mutex + +A possible last extension would be to use channels: https://go.dev/blog/pipelines + +## Downloads + +The download is simple — create a file in a temporary location, and `http.Get` into it with `io.Copy`. + +## `imagemagick` + +To run ImageMagick (and this whole thing) in a repeatable way, we will do it all in a Docker container based on `dpokidov/imagemagick:latest-bullseye` using multi-stage build. This will give us the `magick` command. + +To be able to run the tests and the app, we end up with multiple targets: + +```Dockerfile +FROM golang:1.19-bullseye as base + +# ... install dependencies & build ... + +FROM base as test + +# ... run tests ... + +FROM base as run + +# ... run app ... +``` + +Which can then be built by specifying the `--target`: + +```console +> docker build --target test -t test . +``` + +## Grayscale + +`convert`, accessed via `ConvertImageCommand`, with `-set colorspace Gray -separate -average` seems to work well. + +## Developing + +We can run locally. A few things are needed. + +In VSCode settings, if using the go extension: + +```json +"gopls": { + "build.env": { + "CGO_CFLAGS_ALLOW": "-Xpreprocessor" + } +} +``` + +On the CLI: + +```console +export PKG_CONFIG_PATH="/usr/local/opt/imagemagick@6/lib/pkgconfig" +``` + +### Developing in Docker + +To develop the app with Docker, we need a slightly fancier command: + +```Makefile +develop: + mkdir -p mount + docker build --target develop -t develop . + docker run -it --mount type=bind,source="$$(pwd)",target=/app --mount type=bind,source="/tmp",target=/tmp --rm develop + rm -rf ./mount +``` + +## Upload to S3 + +- Get credentials set up — https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-getting-started.html +- `brew install awscli` +- `aws configure` + +Follow upload example here: `https://github.com/aws/aws-sdk-go` + +We need to mount creds from host: `--mount type=bind,source="$$(echo $$HOME)/.aws",target=/root/.aws` + +Create `S3ReadWriteGoCourse` policy for IAM role: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "ListObjectsInBucket", + "Effect": "Allow", + "Action": ["s3:ListBucket"], + "Resource": ["arn:aws:s3:::[ID]"] + }, + { + "Sid": "AllObjectActions", + "Effect": "Allow", + "Action": "s3:*Object", + "Resource": ["arn:aws:s3:::[ID]/*"] + } + ] +} +``` + +Create `GoCourseLambdaUserReadWriteS3` Role allowing accounts + Lambda to read/write, trust policy: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "AWS": "arn:aws:iam::[ID]:root" + }, + "Action": "sts:AssumeRole" + }, + { + "Effect": "Allow", + "Principal": { + "Service": "lambda.amazonaws.com" + }, + "Action": "sts:AssumeRole" + } + ] +} +``` + +We can then load using ARN passed via env: + +```go +// Set up S3 session +// All clients require a Session. The Session provides the client with +// shared configuration such as region, endpoint, and credentials. +sess := session.Must(session.NewSession()) + +// Create the credentials from AssumeRoleProvider to assume the role +// referenced by the ARN. +creds := stscreds.NewCredentials(sess, awsRoleArn) + +// Create service client value configured for credentials +// from assumed role. +svc := s3.New(sess, &aws.Config{Credentials: creds}) +``` + +Need to create a `docker_env` file with config: + +```env +AWS_REGION=eu-west-1 +AWS_ROLE_ARN=arn:aws:iam::[ID]:role/GoCourseLambdaUserReadWriteS3 +S3_BUCKET=[ID] +``` + +## Output + +Write CSV with the input and output together. diff --git a/batch-processing/Makefile b/batch-processing/Makefile new file mode 100644 index 000000000..eb21812c9 --- /dev/null +++ b/batch-processing/Makefile @@ -0,0 +1,31 @@ +SHELL=/bin/bash +.PHONY: run test develop + +outputs: + mkdir -p outputs + +run: outputs + docker build --target run -t run . + docker run \ + --env-file docker_env \ + --mount type=bind,source="$$(echo $$HOME)/.aws",target=/root/.aws \ + --mount type=bind,source="$$(pwd)/inputs",target=/inputs \ + --mount type=bind,source="$$(pwd)/outputs",target=/outputs \ + --rm run + +test: outputs + docker build --target test -t test . + docker run \ + --rm test + +develop: outputs + docker build --target develop -t develop . + docker run -it \ + --env-file docker_env \ + --mount type=bind,source="$$(go env GOCACHE)",target=/root/.cache/go-build \ + --mount type=bind,source="$$(echo $$HOME)/.aws",target=/root/.aws \ + --mount type=bind,source="$$(pwd)",target=/app \ + --mount type=bind,source="$$(pwd)/inputs",target=/inputs \ + --mount type=bind,source="$$(pwd)/outputs",target=/outputs \ + --mount type=bind,source="/tmp",target=/tmp \ + --rm develop \ No newline at end of file diff --git a/batch-processing/go.mod b/batch-processing/go.mod new file mode 100644 index 000000000..6036d83b0 --- /dev/null +++ b/batch-processing/go.mod @@ -0,0 +1,9 @@ +module github.com/CodeYourFuture/immersive-go-course/batch-processing + +go 1.19 + +require ( + github.com/aws/aws-sdk-go v1.44.109 // indirect + github.com/jmespath/go-jmespath v0.4.0 // indirect + gopkg.in/gographics/imagick.v2 v2.6.2 // indirect +) diff --git a/batch-processing/go.sum b/batch-processing/go.sum new file mode 100644 index 000000000..8487393ca --- /dev/null +++ b/batch-processing/go.sum @@ -0,0 +1,19 @@ +github.com/aws/aws-sdk-go v1.44.109 h1:+Na5JPeS0kiEHoBp5Umcuuf+IDqXqD0lXnM920E31YI= +github.com/aws/aws-sdk-go v1.44.109/go.mod h1:y4AeaBuwd2Lk+GepC1E9v0qOiTws0MIWAX4oIKwKHZo= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg= +github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo= +github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/gographics/imagick.v2 v2.6.2 h1:8ILTJzDKQKSYSfav+9GZs9H8zOOR2UtZVTWkUdFoiZ8= +gopkg.in/gographics/imagick.v2 v2.6.2/go.mod h1:/QVPLV/iKdNttRKthmDkeeGg+vdHurVEPc8zkU0XgBk= +gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= diff --git a/batch-processing/inputs/download-error.csv b/batch-processing/inputs/download-error.csv new file mode 100644 index 000000000..b090cf2e4 --- /dev/null +++ b/batch-processing/inputs/download-error.csv @@ -0,0 +1,2 @@ +url +https://images.unsplash.com/photo-does-not-exist \ No newline at end of file diff --git a/batch-processing/main.go b/batch-processing/main.go new file mode 100644 index 000000000..55c754596 --- /dev/null +++ b/batch-processing/main.go @@ -0,0 +1,224 @@ +package main + +import ( + "bytes" + "encoding/csv" + "flag" + "fmt" + "io" + "log" + "math/rand" + "net/http" + "os" + "path/filepath" + "time" + + "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/aws/credentials/stscreds" + "github.com/aws/aws-sdk-go/aws/session" + "github.com/aws/aws-sdk-go/service/s3" + "gopkg.in/gographics/imagick.v2/imagick" +) + +type Config struct { + AwsRoleArn string + AwsRegion string + S3Bucket string +} + +type Row struct { + index int + url string + inputFilepath string + outputFilepath string + outputKey string + outputUrl string +} + +func readAndValidateCsv(in io.Reader) ([][]string, error) { + r := csv.NewReader(in) + records, err := r.ReadAll() + if err != nil { + return [][]string{}, err + } + + if len(records) <= 1 { + return [][]string{}, fmt.Errorf("empty csv") + } + + headerRow := records[0] + if len(headerRow) == 0 || headerRow[0] != "url" { + return [][]string{}, fmt.Errorf("incorrect column name: expected \"url\", got %q", headerRow[0]) + } + + return records, nil +} + +func (row Row) handleRow(svc *s3.S3, config *Config) error { + i, url, inputFilepath, outputFilepath := row.index, row.url, row.inputFilepath, row.outputFilepath + // Create a new file that we will write to + inputFile, err := os.Create(inputFilepath) + if err != nil { + return fmt.Errorf("error: row %d (%q): %v", i, url, err) + } + defer inputFile.Close() + + // Get it from the internet! + res, err := http.Get(url) + if err != nil { + return fmt.Errorf("error: row %d (%q): %v", i, url, err) + } + defer res.Body.Close() + + // Ensure we got success from the server + if res.StatusCode != http.StatusOK { + return fmt.Errorf("error: download failed: row %d (%q): %s", i, url, res.Status) + } + + // Copy the body of the response to the created file + _, err = io.Copy(inputFile, res.Body) + if err != nil { + return fmt.Errorf("error: row %d (%q): %v", i, url, err) + } + + // Convert the image to grayscale using imagemagick + // We are directly calling the convert command + _, err = imagick.ConvertImageCommand([]string{ + "convert", inputFilepath, "-set", "colorspace", "Gray", outputFilepath, + }) + if err != nil { + return fmt.Errorf("error: row %d (%q): %v", i, url, err) + } + + log.Printf("processed: row %d (%q) to %q\n", i, url, outputFilepath) + + outputFile, err := os.Open(outputFilepath) + if err != nil { + return fmt.Errorf("error: row %d (%q): %v", i, url, err) + } + + // Uploads the object to S3. The Context will interrupt the request if the + // timeout expires. + _, err = svc.PutObject(&s3.PutObjectInput{ + Bucket: aws.String(config.S3Bucket), + Key: aws.String(row.outputKey), + Body: outputFile, + }) + + if err != nil { + return fmt.Errorf("error: row %d (%q): %v", i, url, err) + } + + return nil +} + +func main() { + // We need a file to read from... + inputCsv := flag.String("input", "", "A path to a CSV with a `url` column, containing URLs for images to be processed") + // ... and a file to write to + outputCsv := flag.String("output", "", "Location that the output of this command should be written") + + flag.Parse() + if *inputCsv == "" || *outputCsv == "" { + flag.Usage() + os.Exit(1) + } + + awsRoleArn := os.Getenv("AWS_ROLE_ARN") + if awsRoleArn == "" { + log.Fatalln("Please set AWS_ROLE_ARN environment variable") + } + awsRegion := os.Getenv("AWS_REGION") + if awsRegion == "" { + log.Fatalln("Please set AWS_REGION environment variable") + } + s3Bucket := os.Getenv("S3_BUCKET") + if s3Bucket == "" { + log.Fatalln("Please set S3_BUCKET environment variable") + } + + config := &Config{ + AwsRoleArn: awsRoleArn, + AwsRegion: awsRegion, + S3Bucket: s3Bucket, + } + + // Set up S3 session + // All clients require a Session. The Session provides the client with + // shared configuration such as region, endpoint, and credentials. + sess := session.Must(session.NewSession()) + + // Create the credentials from AssumeRoleProvider to assume the role + // referenced by the ARN. + creds := stscreds.NewCredentials(sess, config.AwsRoleArn) + + // Create service client value configured for credentials + // from assumed role. + svc := s3.New(sess, &aws.Config{Credentials: creds}) + + // Set up imagemagick + imagick.Initialize() + defer imagick.Terminate() + + // Open the file supplied + in, err := os.Open(*inputCsv) + if err != nil { + log.Fatal(err) + } + + // Read the file using the encoding/csv package + inputRecords, err := readAndValidateCsv(in) + if err != nil { + log.Fatal(err) + } + + outputRecords := make([][]string, 0, len(inputRecords)-1) + outputRecords = append(outputRecords, []string{"url", "input", "output", "s3url"}) + + for i, row := range inputRecords[1:] { + url := row[0] + + prefix := fmt.Sprintf("/tmp/%d-%d", time.Now().UnixMilli(), rand.Int()) + inputFilepath := fmt.Sprintf("%s.%s", prefix, "jpg") + outputFilepath := fmt.Sprintf("%s-out.%s", prefix, "jpg") + // Upload just using the final part of the output filepath + outputKey := filepath.Base(outputFilepath) + outputUrl := fmt.Sprintf("https://%s.s3.%s.amazonaws.com/%s", config.S3Bucket, config.AwsRegion, outputKey) + + log.Printf("downloading: row %d (%q) to %q\n", i, url, inputFilepath) + + row := Row{ + index: i, + url: url, + inputFilepath: inputFilepath, + outputFilepath: outputFilepath, + outputKey: outputKey, + outputUrl: outputUrl, + } + + err := row.handleRow(svc, config) + if err != nil { + log.Printf("error: row %d (%q): %v", i, url, err) + continue + } + + outputRecords = append(outputRecords, []string{row.url, row.inputFilepath, row.outputFilepath, row.outputUrl}) + + log.Printf("uploaded: row %d (%q) to %s\n", i, url, outputUrl) + } + + // Turn the output records into a CSV + buf := new(bytes.Buffer) + w := csv.NewWriter(buf) + err = w.WriteAll(outputRecords) + if err != nil { + log.Fatalf("failed to create CSV from output records: %v\n", err) + } + err = os.WriteFile(*outputCsv, buf.Bytes(), os.FileMode(0644)) + if err != nil { + log.Fatalf("failed to write output records to file: %v\n", err) + } + + log.Printf("output: %q", *outputCsv) + log.Printf("summary: %d of %d uploaded", len(outputRecords), len(inputRecords)) +} diff --git a/batch-processing/main_test.go b/batch-processing/main_test.go new file mode 100644 index 000000000..f02c7574f --- /dev/null +++ b/batch-processing/main_test.go @@ -0,0 +1,43 @@ +package main + +import ( + "strings" + "testing" +) + +func TestReadValidateCsv(t *testing.T) { + in := strings.NewReader(`url +http://host/path.jpg`) + records, err := readAndValidateCsv(in) + if err != nil { + t.Fatal(err) + } + if len(records) != 2 { + t.Fatalf("records incorrect length: expected 2, got %d\n", len(records)) + } +} + +func TestReadValidateCsvHeaderValidation(t *testing.T) { + in := strings.NewReader(`nope +http://host/path.jpg`) + records, err := readAndValidateCsv(in) + if err == nil { + t.Fatalf("expected error: got %v\n", records) + } +} + +func TestReadValidateCsvEmptyCsv(t *testing.T) { + in := strings.NewReader("") + records, err := readAndValidateCsv(in) + if err == nil { + t.Fatalf("expected error: got %v\n", records) + } +} + +func TestReadValidateCsvEmptyBody(t *testing.T) { + in := strings.NewReader(`url`) + records, err := readAndValidateCsv(in) + if err == nil { + t.Fatalf("expected error: got %v\n", records) + } +}