cuDNN for llama3 #819
Workflow file for this run
```yaml
name: GPU Builds and Tests

on:
  create:
  workflow_dispatch:
  push:
    branches:
      - master
  pull_request:
    branches:
      - master
      - llama3
```
```yaml
jobs:
  build-and-test-gpt2:
    runs-on: ubicloud-gpu-standard-1-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Install OpenMP
        run: sudo apt-get update && sudo apt-get install -y libomp-dev

      - name: Install dependencies
        run: pip install -r requirements.txt

      - name: Run preprocessing
        run: python dev/data/tinyshakespeare.py

      - name: Train model
        run: python train_gpt2.py

      - name: Compile training and testing programs
        run: make test_gpt2cu train_gpt2cu test_gpt2fp32cu train_gpt2fp32cu

      - name: Train model (With OpenMP)
        run: OMP_NUM_THREADS=8 ./train_gpt2cu

      - name: Train model (FP32) with gpt2_124M.bin
        run: |
          PRECISION=FP32 make train_gpt2cu
          ./train_gpt2cu -b 1 -t 64 -d 256 -l 0.0001 -v 200 -s 200 -a 1 -x 10 -r 0 -f 0 -e "gpt2_124M.bin"

      - name: Check percent loss differential (FP32)
        run: |
          PRECISION=FP32 make train_gpt2cu
          ./train_gpt2cu -b 1 -t 64 -d 256 -l 0.0001 -v 200 -s 200 -a 1 -x 10 -r 0 -f 0 -e "gpt2_124M.bin" > train_gpt2cu_fp32_precision.txt
          python dev/loss_checker_ci.py -f train_gpt2cu_fp32_precision.txt -s 20 -e 28 -a 5.0
```
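The loss check pipes the training log through `dev/loss_checker_ci.py`. Its actual implementation lives in the repo; conceptually (a hedged sketch only — the real flag semantics and log format may differ), it pulls per-step losses between steps `-s` and `-e` out of the log and asserts consecutive losses never jump by more than `-a` percent:

```python
import re

def check_loss(log_text, start_step, end_step, max_pct):
    """Hypothetical sketch of the CI loss check: extract per-step losses
    from a training log and verify consecutive losses stay within
    max_pct percent of each other. The real dev/loss_checker_ci.py
    may use a different log pattern and criterion."""
    losses = []
    for line in log_text.splitlines():
        m = re.search(r"step\s+(\d+).*loss\s+([0-9.]+)", line)
        if m and start_step <= int(m.group(1)) <= end_step:
            losses.append(float(m.group(2)))
    for prev, cur in zip(losses, losses[1:]):
        pct = abs(cur - prev) / prev * 100.0
        assert pct <= max_pct, f"loss jumped {pct:.2f}% (> {max_pct}%)"
    return losses

log = "step 20 loss 3.10\nstep 21 loss 3.05\nstep 22 loss 3.02\n"
print(check_loss(log, 20, 28, 5.0))  # → [3.1, 3.05, 3.02]
```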
```yaml
      - name: Build FP32 precision
        run: PRECISION=FP32 make test_gpt2cu profile_gpt2cu

      - name: Run default (FP32)
        run: ./test_gpt2cu

      - name: Run no recompute GeLU (FP32)
        run: ./test_gpt2cu -r 0

      - name: Run recompute LN (FP32)
        run: ./test_gpt2cu -r 2

      - name: Build BF16 precision
        run: PRECISION=BF16 make train_gpt2cu test_gpt2cu profile_gpt2cu

      - name: Run default (BF16)
        run: ./test_gpt2cu

      - name: Run no recompute GeLU (BF16)
        run: ./test_gpt2cu -r 0

      - name: Run no master weights (BF16)
        run: ./test_gpt2cu -w 0

      - name: Run recompute LN (BF16)
        run: ./test_gpt2cu -r 2
```
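The `-r` flag selects an activation-recomputation level (the exact semantics are defined in the llm.c sources). The trade-off it exercises can be sketched like this — an illustrative toy, not llm.c's code: with recomputation on, the GeLU output is not stashed for the backward pass and is recomputed from the saved input instead, trading compute for activation memory:

```python
import math

def gelu(x):
    # tanh approximation of GeLU, as used in GPT-2
    return 0.5 * x * (1.0 + math.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * x**3)))

def forward(x, recompute):
    """With recompute on, only the input is saved; the GeLU output is
    regenerated in the backward pass rather than kept in memory."""
    y = gelu(x)
    saved = {"x": x} if recompute else {"x": x, "y": y}
    return y, saved

def backward_needs_act(saved, recompute):
    # The backward pass needs gelu(x): either reload it or recompute it.
    return gelu(saved["x"]) if recompute else saved["y"]

y, saved = forward(0.5, recompute=True)
print(abs(backward_needs_act(saved, True) - y) < 1e-12)  # → True (identical either way)
```

Either path yields the same activation value, which is why the `-r 0` / `-r 2` runs are expected to pass the same correctness test.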
```yaml
      - name: Train model fp32 (With OpenMP)
        run: OMP_NUM_THREADS=8 ./train_gpt2fp32cu

      - name: Execute testing program (With OpenMP)
        run: OMP_NUM_THREADS=8 ./test_gpt2cu

      - name: Execute testing program fp32 (With OpenMP)
        run: OMP_NUM_THREADS=8 ./test_gpt2fp32cu

      - name: Compile training and testing programs without OpenMP
        run: NO_OMP=1 make test_gpt2cu train_gpt2cu test_gpt2fp32cu train_gpt2fp32cu

      - name: Train model (No OpenMP)
        run: NO_OMP=1 ./train_gpt2cu

      - name: Train model fp32 (No OpenMP)
        run: NO_OMP=1 ./train_gpt2fp32cu

      - name: Execute testing program (No OpenMP)
        run: ./test_gpt2cu -b 32

      - name: Execute testing program fp32 (No OpenMP)
        run: ./test_gpt2fp32cu

      - name: Install cuDNN-frontend
        run: git clone https://github.com/NVIDIA/cudnn-frontend.git

      - name: Build with cuDNN
        run: USE_CUDNN=1 make test_gpt2cu train_gpt2cu

      - name: Train model with cuDNN
        run: ./train_gpt2cu

      - name: Execute testing program with cuDNN
        run: ./test_gpt2cu
```
```yaml
  build-and-test-llama3:
    name: Build and test Llama 3.2 1B
    runs-on: ubicloud-gpu-standard-1-latest
    container:
      image: nvidia/cuda:12.6.3-cudnn-devel-ubuntu22.04
    env:
      # never hardcode the token; pull it from the repository secrets
      HF_TOKEN: ${{ secrets.HF_TOKEN }}

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - run: echo "::add-mask::$HF_TOKEN"

      - name: Install OpenMP
        run: apt-get update && apt-get install -y libomp-dev libopenmpi-dev python3-pip

      - name: Install dependencies
        run: pip install -r requirements.txt

      - name: Run preprocessing
        run: python3 dev/data/tinyshakespeare.py --model_desc llama-3

      - name: Train model
        # use the first 10 layers, so that everything fits into the 20GB of
        # the A4000 Ada that we have in CI
        run: python3 train_llama3.py --write_tensors 1 --dtype float32 --depth 10
```
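A back-of-the-envelope check of why `--depth 10` helps here. The dimensions below are the published Llama 3.2 1B config (an assumption of this sketch, not something the workflow states); the estimate covers weights only, and optimizer state plus activations multiply the footprint several times over, which is what makes the 20 GB budget tight at full depth:

```python
# Rough FP32 parameter-memory estimate for a depth-10 Llama 3.2 1B.
# Assumed published config: vocab 128256, hidden 2048, FFN 8192,
# 8 KV heads of dim 64 (grouped-query attention), tied embeddings.
V, H, FFN = 128256, 2048, 8192
N_KV_HEADS, HEAD_DIM = 8, 64
KV = N_KV_HEADS * HEAD_DIM              # 512-wide K/V projections

attn = 2 * H * H + 2 * H * KV           # Wq, Wo plus the narrower Wk, Wv
mlp = 3 * H * FFN                       # gate, up, down projections
per_layer = attn + mlp
embed = V * H                           # token embedding (tied with the LM head)

params_depth10 = embed + 10 * per_layer
gb = params_depth10 * 4 / 2**30         # FP32: 4 bytes per parameter
print(round(params_depth10 / 1e6), "M params,", round(gb, 2), "GB for weights alone")
# → 871 M params, 3.24 GB for weights alone
```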
```yaml
      - name: Build FP32 precision
        run: PRECISION=FP32 NO_MULTI_GPU=1 make test_llama3cu

      - name: Run default (FP32)
        run: ./test_llama3cu

      - name: Run no recompute GeLU (FP32)
        run: ./test_llama3cu -r 0

      - name: Run recompute LN (FP32)
        run: ./test_llama3cu -r 2

      - name: Build BF16 precision
        run: PRECISION=BF16 NO_MULTI_GPU=1 make train_llama3cu test_llama3cu

      - name: Run default (BF16)
        run: ./test_llama3cu

      - name: Run no recompute GeLU (BF16)
        run: ./test_llama3cu -r 0

      - name: Run no master weights (BF16)
        run: ./test_llama3cu -w 0

      - name: Run recompute LN (BF16)
        run: ./test_llama3cu -r 2
```
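The `-w 0` run disables master weights. The idea being tested (sketched here with a bfloat16-rounding helper, not llm.c's actual optimizer code) is that BF16 has only ~7 mantissa bits, so small optimizer updates can round away entirely unless an FP32 master copy of each parameter accumulates them:

```python
import struct

def to_bf16(x):
    """Round a Python float to the nearest bfloat16 value (round-to-nearest-even
    on the top 16 bits of the float32 encoding) — a stand-in for BF16 storage."""
    bits = struct.unpack("<I", struct.pack("<f", x))[0]
    bits = (bits + 0x7FFF + ((bits >> 16) & 1)) & 0xFFFF0000
    return struct.unpack("<f", struct.pack("<I", bits))[0]

# Apply a tiny update 100 times: with an FP32 master copy the updates
# accumulate; updating the BF16 value in place loses every one to rounding.
w_master, w_bf16 = 1.0, to_bf16(1.0)
update = 1e-4
for _ in range(100):
    w_master += update                 # master-weights path (-w 1)
    w_bf16 = to_bf16(w_bf16 + update)  # no-master path (-w 0)

print(round(w_master, 4), w_bf16)  # → 1.01 1.0
```

Near 1.0, a BF16 ulp is 2^-7 ≈ 0.0078, so each 1e-4 step rounds back to 1.0; the master copy reaches 1.01 as expected.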
```yaml
  build-and-test-llama3-untied:
    name: Build and test Llama 3.2 1B with untied weights
    runs-on: ubicloud-gpu-standard-1-latest
    env:
      # never hardcode the token; pull it from the repository secrets
      HF_TOKEN: ${{ secrets.HF_TOKEN }}

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - run: echo "::add-mask::$HF_TOKEN"

      - name: Install OpenMP
        run: sudo apt-get update && sudo apt-get install -y libomp-dev git

      - name: Install dependencies
        run: pip install -r requirements.txt

      - name: Run preprocessing
        run: python dev/data/tinyshakespeare.py --model_desc llama-3

      - name: Train model
        run: python train_llama3.py --write_tensors 1 --dtype float32 --untie 1 --depth 10

      - name: Build FP32 precision
        run: PRECISION=FP32 make test_llama3cu

      - name: Run default (FP32)
        run: ./test_llama3cu

      - name: Build BF16 precision
        run: PRECISION=BF16 make train_llama3cu test_llama3cu

      - name: Run default (BF16)
        run: ./test_llama3cu

      - name: Install cuDNN-frontend
        run: git clone https://github.com/NVIDIA/cudnn-frontend.git

      - name: Build with cuDNN
        run: USE_CUDNN=1 PRECISION=BF16 NO_MULTI_GPU=1 make train_llama3cu test_llama3cu

      - name: Train model with cuDNN
        run: ./train_llama3cu

      - name: Execute testing program with cuDNN
        run: ./test_llama3cu
```
```yaml
  unit-tests-gpu:
    runs-on: ubicloud-gpu-standard-1-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Test Device<->File IO
        run: cd dev/test && nvcc -o device_file_io device_file_io.cu && ./device_file_io
```
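The unit test compiles and runs `dev/test/device_file_io.cu`; judging by its name, it round-trips a buffer between device memory and a file. The invariant it guards, stated host-side (a conceptual sketch, not the CUDA test itself):

```python
import os, struct, tempfile

def roundtrip(values):
    """Write a float32 buffer to disk and read it back, checking bitwise
    equality — the host-side analogue of a device<->file IO round trip."""
    packed = struct.pack(f"<{len(values)}f", *values)
    with tempfile.NamedTemporaryFile(delete=False) as f:
        f.write(packed)
        path = f.name
    with open(path, "rb") as f:
        restored = list(struct.unpack(f"<{len(values)}f", f.read()))
    os.remove(path)
    return restored

data = [0.0, 1.5, -2.25, 3.0]   # exactly representable in float32
print(roundtrip(data) == data)  # → True
```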