2 changes: 1 addition & 1 deletion common/common.h
@@ -413,7 +413,7 @@ struct common_params {
bool kv_unified = false; // enable unified KV cache

bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool use_mmap = true; // use mmap for faster loads
Member

Changing this to false by default results in a huge slowdown on macOS with default arguments:

```shell
time ./bin/llama-completion -m ../models/gpt-oss-120b/ggml-model-mxfp4.gguf -p "hello" -n 1 -no-cnv
```

```
# master
real	0m4.648s

# PR
real	0m17.957s
```

I'm not sure what the best way to handle this is. If we keep it true, then Linux users would not get the benefit of Direct IO. If we switch to false, Mac users will take the hit.

Contributor Author

Would it be OK to set the mmap default depending on the platform?
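For example, a minimal sketch of a platform-conditional default (hypothetical, not part of this PR):

```cpp
// Sketch only: choose the use_mmap default per platform, so Linux gets
// Direct IO by default while macOS keeps the faster mmap path.
#if defined(__linux__)
    bool use_mmap = false; // uncached/Direct IO reads
#else
    bool use_mmap = true;  // mmap is the faster option on macOS
#endif
```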

Member

We don't have such a precedent atm for any of the parameters in common, so I would say it's not ideal.

Contributor Author

On an M4 Pro with GPT-OSS-20B I get, on a cold load, --no-mmap: 4.168s and --mmap: 6.3s. A warm load with --mmap, however, takes 2.1s (--no-mmap is still ~4.1s).

Measured using `time ./llama-cli -m /Users/jtischbein/Documents/models/openai_gpt-oss-20b-MXFP4.gguf --no-mmap -p "bla" -n 0 --single-turn`, with the filesystem cache cleared using `purge`.

So the cold load is still faster using --no-mmap, but unfortunately the gain is not as big as on Linux.

Member

We can do the following:

  • Add new CLI argument --direct-io, -dio
  • Description: "Use DirectIO if available. Takes precedence over --mmap"
  • Keep use_mmap == true and use_direct_io == true
  • On Mac, the internal implementation will determine that DIO is not available, so it will fall back to mmap

Might want to do it in a separate PR as it would require changes to the libllama API. This PR should keep use_mmap == true by default.
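Roughly, the load path could then pick the backend like this (a sketch under the proposed semantics; the load_with_* helpers are hypothetical, only llama_file::has_direct_io() exists in this PR):

```cpp
bool use_mmap      = true; // default, unchanged
bool use_direct_io = true; // would be set by --direct-io / -dio

void * load_model_data(llama_file & file) {
    if (use_direct_io && file.has_direct_io()) {
        return load_with_direct_io(file); // Linux: O_DIRECT reads
    }
    if (use_mmap) {
        return load_with_mmap(file);      // macOS etc.: DIO unavailable, fall back
    }
    return load_buffered(file);           // --no-mmap without DIO
}
```

This way neither platform regresses: Linux opts into Direct IO while macOS keeps the mmap fast path.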

Contributor Author

Sounds good

bool use_mmap = false; // use uncached reads for faster loads
bool use_mlock = false; // use mlock to keep model in memory
bool verbose_prompt = false; // print prompt tokens before generation
bool display_prompt = true; // print prompt before generation
145 changes: 117 additions & 28 deletions src/llama-mmap.cpp
@@ -13,9 +13,10 @@
#ifdef __has_include
#if __has_include(<unistd.h>)
#include <unistd.h>
#include <fcntl.h>
#include <sys/stat.h>
#if defined(_POSIX_MAPPED_FILES)
#include <sys/mman.h>
#include <fcntl.h>
#endif
#if defined(_POSIX_MEMLOCK_RANGE)
#include <sys/resource.h>
@@ -74,7 +75,7 @@ struct llama_file::impl {
return ret;
}

impl(const char * fname, const char * mode) {
impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) {
fp = ggml_fopen(fname, mode);
if (fp == NULL) {
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
@@ -153,13 +154,43 @@ struct llama_file::impl {
write_raw(&val, sizeof(val));
}

bool has_direct_io() const {
return false;
}

void read_aligned_chunk(size_t offset, void * dest, size_t size, size_t alignment) const {
throw std::runtime_error("DirectIO is not implemented on Windows.");
}

~impl() {
if (fp) {
std::fclose(fp);
}
}
#else
impl(const char * fname, const char * mode) {
impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) {
#ifdef __linux__
// Try unbuffered I/O for read only
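// Note: O_DIRECT bypasses the page cache, so the kernel requires the
// buffer address, file offset and transfer size to be block-aligned;
// aligned transfers are handled by read_aligned_chunk() below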
if (use_direct_io && std::strcmp(mode, "rb") == 0) {
fd = open(fname, O_RDONLY | O_DIRECT);

if (fd != -1) {
struct stat file_stats{};
if (fstat(fd, &file_stats) == -1) {
throw std::runtime_error(format("fstat error: %s", strerror(errno)));
}

size = file_stats.st_size;

off_t ret = lseek(fd, 0, SEEK_SET);
if (ret == -1) {
throw std::runtime_error(format("seek error: %s", strerror(errno)));
}
return;
}

LLAMA_LOG_WARN("Failed to open model %s with error: %s. Falling back to buffered I/O",
fname, strerror(errno));
}
#endif
fp = ggml_fopen(fname, mode);
if (fp == NULL) {
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
@@ -170,27 +201,30 @@
}

size_t tell() const {
// TODO: this ifdef is never true?
#ifdef _WIN32
__int64 ret = _ftelli64(fp);
#else
long ret = std::ftell(fp);
#endif
if (ret == -1) {
throw std::runtime_error(format("ftell error: %s", strerror(errno)));
if (fd == -1) {
long ret = std::ftell(fp);
if (ret == -1) {
throw std::runtime_error(format("ftell error: %s", strerror(errno)));
}

return (size_t) ret;
}

return (size_t) ret;
off_t pos = lseek(fd, 0, SEEK_CUR);
if (pos == -1) {
throw std::runtime_error(format("lseek error: %s", strerror(errno)));
}
return (size_t) pos;
}

void seek(size_t offset, int whence) const {
// TODO: this ifdef is never true?
#ifdef _WIN32
int ret = _fseeki64(fp, (__int64) offset, whence);
#else
int ret = std::fseek(fp, (long) offset, whence);
#endif
if (ret != 0) {
off_t ret = 0;
if (fd == -1) {
ret = std::fseek(fp, (long) offset, whence);
} else {
ret = lseek(fd, offset, whence);
}
if (ret == -1) {
throw std::runtime_error(format("seek error: %s", strerror(errno)));
}
}
@@ -200,13 +234,55 @@
return;
}
errno = 0;
std::size_t ret = std::fread(ptr, len, 1, fp);
if (ferror(fp)) {
throw std::runtime_error(format("read error: %s", strerror(errno)));
if (fd == -1) {
std::size_t ret = std::fread(ptr, len, 1, fp);
if (ferror(fp)) {
throw std::runtime_error(format("read error: %s", strerror(errno)));
}
if (ret != 1) {
throw std::runtime_error("unexpectedly reached end of file");
}
} else {
// read() may return fewer bytes than requested (and -1/EINTR on a
// signal), so loop until the full length has been read
size_t bytes_read = 0;
while (bytes_read < len) {
ssize_t ret = read(fd, (char *) ptr + bytes_read, len - bytes_read);

if (ret == -1) {
if (errno == EINTR) {
continue; // Interrupted by signal, retry
}
throw std::runtime_error(format("read error: %s", strerror(errno)));
}
if (ret == 0) {
throw std::runtime_error("unexpectedly reached end of file");
}

bytes_read += (size_t) ret;
}
}
if (ret != 1) {
throw std::runtime_error("unexpectedly reached end of file");
}

void read_aligned_chunk(size_t offset, void * dest, size_t size, size_t alignment) const {
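// Round the offset down and the transfer size up to alignment
// boundaries, as required for O_DIRECT transfers.
// Example with alignment = 4096: offset = 4000, size = 300 gives
// aligned_offset = 0, offset_from_alignment = 4000, bytes_to_read = 8192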
off_t aligned_offset = offset & ~(alignment - 1);
off_t offset_from_alignment = offset - aligned_offset;
size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1);

void * raw_buffer = nullptr;
int ret = posix_memalign(&raw_buffer, alignment, bytes_to_read);
if (ret != 0) {
throw std::runtime_error(format("posix_memalign failed with error %d", ret));
}

struct aligned_buffer_deleter {
void operator()(void * p) const { free(p); }
};
std::unique_ptr<void, aligned_buffer_deleter> buffer(raw_buffer);

seek(aligned_offset, SEEK_SET);
read_raw(buffer.get(), bytes_to_read);

uintptr_t actual_data = reinterpret_cast<uintptr_t>(buffer.get()) + offset_from_alignment;
memcpy(dest, reinterpret_cast<void *>(actual_data), size);
}

uint32_t read_u32() const {
@@ -230,23 +306,33 @@ struct llama_file::impl {
write_raw(&val, sizeof(val));
}

bool has_direct_io() const {
return fd != -1;
}

~impl() {
if (fp) {
if (fd != -1) {
close(fd);
} else {
std::fclose(fp);
}
}
int fd = -1;
#endif

FILE * fp;
size_t size;
FILE * fp{};
size_t size{};
};

llama_file::llama_file(const char * fname, const char * mode) : pimpl(std::make_unique<impl>(fname, mode)) {}
llama_file::llama_file(const char * fname, const char * mode, const bool use_direct_io) :
pimpl(std::make_unique<impl>(fname, mode, use_direct_io)) {}
llama_file::~llama_file() = default;

size_t llama_file::tell() const { return pimpl->tell(); }
size_t llama_file::size() const { return pimpl->size; }

bool llama_file::has_direct_io() const { return pimpl->has_direct_io(); }

int llama_file::file_id() const {
#ifdef _WIN32
return _fileno(pimpl->fp);
@@ -261,6 +347,9 @@ int llama_file::file_id() const {

void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); }
void llama_file::read_raw(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); }
void llama_file::read_aligned_chunk(size_t offset, void * dest, size_t size, size_t alignment) const { pimpl->read_aligned_chunk(offset, dest, size, alignment); }

uint32_t llama_file::read_u32() const { return pimpl->read_u32(); }

4 changes: 3 additions & 1 deletion src/llama-mmap.h
@@ -13,7 +13,7 @@ using llama_mmaps = std::vector<std::unique_ptr<llama_mmap>>;
using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;

struct llama_file {
llama_file(const char * fname, const char * mode);
llama_file(const char * fname, const char * mode, bool use_direct_io = false);
~llama_file();

size_t tell() const;
@@ -24,11 +24,13 @@ struct llama_file {
void seek(size_t offset, int whence) const;

void read_raw(void * ptr, size_t len) const;
void read_aligned_chunk(size_t offset, void * dest, size_t size, size_t alignment) const;
uint32_t read_u32() const;

void write_raw(const void * ptr, size_t len) const;
void write_u32(uint32_t val) const;

bool has_direct_io() const;
private:
struct impl;
std::unique_ptr<impl> pimpl;