mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-11 19:21:46 +00:00
CUDA: tighter VRAM scratch size for 65b/70b (#2551)
This commit is contained in:
parent
7ed8d1fe7f
commit
acfc5478ff
12
llama.cpp
12
llama.cpp
@ -149,7 +149,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
|
|||||||
}
|
}
|
||||||
|
|
||||||
// amount of VRAM needed per batch size to hold temporary results
|
// amount of VRAM needed per batch size to hold temporary results
|
||||||
// the values for 3b and 65b are not derived from testing but instead chosen conservatively
|
// the values for 3b are not derived from testing but instead chosen conservatively
|
||||||
static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
|
static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
|
||||||
{
|
{
|
||||||
static std::map<e_model, size_t> k_sizes = {
|
static std::map<e_model, size_t> k_sizes = {
|
||||||
@ -157,14 +157,14 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
|
|||||||
{ MODEL_7B, 512ull * kB },
|
{ MODEL_7B, 512ull * kB },
|
||||||
{ MODEL_13B, 640ull * kB },
|
{ MODEL_13B, 640ull * kB },
|
||||||
{ MODEL_30B, 768ull * kB },
|
{ MODEL_30B, 768ull * kB },
|
||||||
{ MODEL_65B, 1536ull * kB },
|
{ MODEL_65B, 1280ull * kB },
|
||||||
{ MODEL_70B, 1536ull * kB }, // TODO (likely can be reduced)
|
{ MODEL_70B, 1280ull * kB },
|
||||||
};
|
};
|
||||||
return k_sizes;
|
return k_sizes;
|
||||||
}
|
}
|
||||||
|
|
||||||
// amount of VRAM needed per batch size and context to hold temporary results
|
// amount of VRAM needed per batch size and context to hold temporary results
|
||||||
// the values for 3b and 65b are not derived from testing but instead chosen conservatively
|
// the values for 3b are not derived from testing but instead chosen conservatively
|
||||||
static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
|
static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
|
||||||
{
|
{
|
||||||
static std::map<e_model, size_t> k_sizes = {
|
static std::map<e_model, size_t> k_sizes = {
|
||||||
@ -172,8 +172,8 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
|
|||||||
{ MODEL_7B, 128ull },
|
{ MODEL_7B, 128ull },
|
||||||
{ MODEL_13B, 160ull },
|
{ MODEL_13B, 160ull },
|
||||||
{ MODEL_30B, 208ull },
|
{ MODEL_30B, 208ull },
|
||||||
{ MODEL_65B, 416ull },
|
{ MODEL_65B, 256ull },
|
||||||
{ MODEL_70B, 416ull }, // TODO (likely can be reduced)
|
{ MODEL_70B, 256ull },
|
||||||
};
|
};
|
||||||
return k_sizes;
|
return k_sizes;
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user