CUDA: tighter VRAM scratch size for 65b/70b (#2551)

This commit is contained in:
Johannes Gäßler 2023-08-08 14:38:16 +02:00 committed by GitHub
parent 7ed8d1fe7f
commit acfc5478ff
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -149,7 +149,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
} }
// amount of VRAM needed per batch size to hold temporary results // amount of VRAM needed per batch size to hold temporary results
// the values for 3b and 65b are not derived from testing but instead chosen conservatively // the values for 3b are not derived from testing but instead chosen conservatively
static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE() static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
{ {
static std::map<e_model, size_t> k_sizes = { static std::map<e_model, size_t> k_sizes = {
@ -157,14 +157,14 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
{ MODEL_7B, 512ull * kB }, { MODEL_7B, 512ull * kB },
{ MODEL_13B, 640ull * kB }, { MODEL_13B, 640ull * kB },
{ MODEL_30B, 768ull * kB }, { MODEL_30B, 768ull * kB },
{ MODEL_65B, 1536ull * kB }, { MODEL_65B, 1280ull * kB },
{ MODEL_70B, 1536ull * kB }, // TODO (likely can be reduced) { MODEL_70B, 1280ull * kB },
}; };
return k_sizes; return k_sizes;
} }
// amount of VRAM needed per batch size and context to hold temporary results // amount of VRAM needed per batch size and context to hold temporary results
// the values for 3b and 65b are not derived from testing but instead chosen conservatively // the values for 3b are not derived from testing but instead chosen conservatively
static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT() static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
{ {
static std::map<e_model, size_t> k_sizes = { static std::map<e_model, size_t> k_sizes = {
@ -172,8 +172,8 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
{ MODEL_7B, 128ull }, { MODEL_7B, 128ull },
{ MODEL_13B, 160ull }, { MODEL_13B, 160ull },
{ MODEL_30B, 208ull }, { MODEL_30B, 208ull },
{ MODEL_65B, 416ull }, { MODEL_65B, 256ull },
{ MODEL_70B, 416ull }, // TODO (likely can be reduced) { MODEL_70B, 256ull },
}; };
return k_sizes; return k_sizes;
} }