Update docs (deploy_on_server, dockerize_models, getting_started) and example Bazel files to include AWS Neuron/Trainium/Inferentia deployment guidance.

This commit is contained in:
Foke Singh 2023-08-21 09:15:48 +00:00
parent 7d24329d0a
commit af0630616c
6 changed files with 482 additions and 421 deletions

View File

@ -17,6 +17,7 @@ following arguments to the command line when compiling / running a model:
- NVIDIA CUDA: `--@zml//runtimes:cuda=true`
- AMD ROCm: `--@zml//runtimes:rocm=true`
- Google TPU: `--@zml//runtimes:tpu=true`
- AWS Trainium/Inferentia 2: `--@zml//runtimes:neuron=true`
- **AVOID CPU:** `--@zml//runtimes:cpu=false`
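For example, combining these flags on the command line could look like this (the `//llama:OpenLLaMA-3B` label is a hypothetical target name for illustration):

```shell
# Compile and run for AWS Trainium/Inferentia 2, skipping the CPU build
bazel run -c opt //llama:OpenLLaMA-3B \
    --@zml//runtimes:neuron=true \
    --@zml//runtimes:cpu=false
```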
So, to run the OpenLLaMA model from above **on your development machine**

View File

@ -7,11 +7,11 @@ just have to append a few lines to your model's `BUILD.bazel`. Here is how it's
done.
**Note:** This walkthrough will work with your installed container runtime, no
matter if it's **Docker or e.g. Podman.** Also, we'll create images in the
[OCI](https://github.com/opencontainers/image-spec) open image format.
Let's try containerizing our [first model](../tutorials/write_first_model.md), as it
doesn't need any additional weights files. We'll see [down below](#adding-weights-and-data)
how to add those. We'll also see how to add GPU/TPU support for our container
there.
@ -57,7 +57,7 @@ zig_cc_binary(
### 1. The Manifest
To get started, let's make bazel generate a manifest that will be used when
creating the TAR archive.
```python
# Manifest created from the simple_layer binary and friends
@ -118,7 +118,7 @@ See how we use string interpolation to fill in the folder name for the
container's entrypoint?
Next, we use a transition rule to force the container to be built for
Linux X86_64:
```python
@ -150,10 +150,10 @@ INFO: Build completed successfully, 1 total action
### 4. The Load
While inspecting the image is surely interesting, we usually want to load the
image so we can run it.
There is a bazel rule for that: `oci_load`. When we append the following lines
to `BUILD.bazel`:
```python
@ -218,7 +218,7 @@ how to build Docker images that also contain data files.
You can `bazel run -c opt //mnist:push -- --repository
index.docker.io/my_org/zml_mnist` in the `./examples` folder if you want to try
it out.
**Note: Please add one or more of the following parameters to specify all the
platforms your containerized model should support.**
@ -226,6 +226,7 @@ platforms your containerized model should support.**
- NVIDIA CUDA: `--@zml//runtimes:cuda=true`
- AMD ROCm: `--@zml//runtimes:rocm=true`
- Google TPU: `--@zml//runtimes:tpu=true`
- AWS Trainium/Inferentia 2: `--@zml//runtimes:neuron=true`
- **AVOID CPU:** `--@zml//runtimes:cpu=false`
**Example:**
@ -337,7 +338,7 @@ oci_image(
name = "image_",
base = "@distroless_cc_debian12",
# the entrypoint comes from the expand_template rule `entrypoint` above
entrypoint = ":entrypoint",
tars = [":archive"],
)
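With the `oci_image` target in place, building the image is then a single command. This is a sketch; the `//simple_layer:image` label is an assumption based on the walkthrough's target names:

```shell
# Build the OCI image for Linux x86_64 via the transition rule
bazel build //simple_layer:image
```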

View File

@ -115,6 +115,7 @@ following arguments to the command line when compiling or running a model:
- NVIDIA CUDA: `--@zml//runtimes:cuda=true`
- AMD ROCm: `--@zml//runtimes:rocm=true`
- Google TPU: `--@zml//runtimes:tpu=true`
- AWS Trainium/Inferentia 2: `--@zml//runtimes:neuron=true`
- **AVOID CPU:** `--@zml//runtimes:cpu=false`
The latter, avoiding compilation for CPU, cuts down compilation time.
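If you use one of these accelerator flags regularly, it can also live in your `.bazelrc` as a named config so you don't retype it (a sketch; the config name `neuron` is an assumption):

```
# .bazelrc
build:neuron --@zml//runtimes:neuron=true
build:neuron --@zml//runtimes:cpu=false
```

Then `bazel run --config=neuron //your:target` applies both flags at once.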

View File

@ -4,9 +4,9 @@ bazel_dep(name = "bazel_skylib", version = "1.7.1")
bazel_dep(name = "rules_zig", version = "20240913.0-1957d05")
bazel_dep(name = "platforms", version = "0.0.10")
bazel_dep(name = "zml", version = "0.1.0")
bazel_dep(name = "aspect_bazel_lib", version = "2.8.1.1")
bazel_dep(name = "aspect_bazel_lib", version = "2.9.3")
bazel_dep(name = "rules_oci", version = "2.0.0")
oci = use_extension("@rules_oci//oci:extensions.bzl", "oci")
oci.pull(
name = "distroless_cc_debian12",
@ -17,6 +17,15 @@ oci.pull(
],
)
use_repo(oci, "distroless_cc_debian12", "distroless_cc_debian12_linux_amd64")
oci.pull(
name = "distroless_cc_debian12_debug",
digest = "sha256:ae6f470336acbf2aeffea3db70ec0e74d69bee7270cdb5fa2f28fe840fad57fe",
image = "gcr.io/distroless/cc-debian12",
platforms = [
"linux/amd64",
],
)
use_repo(oci, "distroless_cc_debian12_debug", "distroless_cc_debian12_debug_linux_amd64")
# Mnist weights
http_file = use_repo_rule("@bazel_tools//tools/build_defs/repo:http.bzl", "http_file")
@ -37,7 +46,6 @@ http_file(
# Llama weights
huggingface = use_extension("@zml//bazel:huggingface.bzl", "huggingface")
huggingface.model(
name = "Karpathy-TinyLlama-Stories",
build_file_content = """\
@ -101,7 +109,6 @@ filegroup(
model = "meta-llama/Meta-Llama-3.1-8B-Instruct",
)
use_repo(huggingface, "Meta-Llama-3.1-8B-Instruct")
huggingface.model(
name = "Meta-Llama-3.1-70B-Instruct",
build_file_content = """\
@ -125,7 +132,6 @@ filegroup(
model = "meta-llama/Meta-Llama-3.1-70B-Instruct",
)
use_repo(huggingface, "Meta-Llama-3.1-70B-Instruct")
huggingface.model(
name = "TinyLlama-1.1B-Chat-v1.0",
build_file_content = """\
@ -149,7 +155,6 @@ filegroup(
model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
)
use_repo(huggingface, "TinyLlama-1.1B-Chat-v1.0")
huggingface.model(
name = "OpenLM-Research-OpenLLaMA-3B",
build_file_content = """\

File diff suppressed because it is too large Load Diff

View File

@ -150,9 +150,12 @@ tar(
oci_image(
name = "image_",
base = "@distroless_cc_debian12",
base = "@distroless_cc_debian12_debug",
entrypoint = ["./{}/llama".format(package_name())],
tars = [":archive"],
tars = [
"@zml//runtimes:layers",
":archive",
],
)
platform_transition_filegroup(