|
11 | 11 | #include <filesystem> |
12 | 12 | #include <fstream> |
13 | 13 | #include <atomic> |
14 | | -#include <regex> // migration only |
15 | 14 | #include <string> |
16 | 15 | #include <string_view> |
17 | 16 | #include <stdexcept> |
@@ -336,15 +335,9 @@ hf_files get_repo_files(const std::string & repo_id, |
336 | 335 | if (item["lfs"].contains("oid") && item["lfs"]["oid"].is_string()) { |
337 | 336 | file.oid = item["lfs"]["oid"].get<std::string>(); |
338 | 337 | } |
339 | | - if (item["lfs"].contains("size") && item["lfs"]["size"].is_number()) { |
340 | | - file.size = item["lfs"]["size"].get<size_t>(); |
341 | | - } |
342 | 338 | } else if (item.contains("oid") && item["oid"].is_string()) { |
343 | 339 | file.oid = item["oid"].get<std::string>(); |
344 | 340 | } |
345 | | - if (file.size == 0 && item.contains("size") && item["size"].is_number()) { |
346 | | - file.size = item["size"].get<size_t>(); |
347 | | - } |
348 | 341 |
|
349 | 342 | if (!file.oid.empty() && !is_valid_oid(file.oid)) { |
350 | 343 | LOG_WRN("%s: skip invalid oid: %s\n", __func__, file.oid.c_str()); |
@@ -502,271 +495,4 @@ std::string finalize_file(const hf_file & file) { |
502 | 495 | return file.final_path; |
503 | 496 | } |
504 | 497 |
|
505 | | -// delete everything after this line, one day |
506 | | - |
507 | | -// copied from download.cpp without the tag part |
508 | | -struct gguf_split_info { |
509 | | - std::string prefix; // tag included |
510 | | - int index; |
511 | | - int count; |
512 | | -}; |
513 | | - |
514 | | -static gguf_split_info get_gguf_split_info(const std::string & path) { |
515 | | - static const std::regex re_split("^(.+)-([0-9]{5})-of-([0-9]{5})$", std::regex::icase); |
516 | | - std::smatch m; |
517 | | - |
518 | | - std::string prefix = path; |
519 | | - if (!string_remove_suffix(prefix, ".gguf")) { |
520 | | - return {}; |
521 | | - } |
522 | | - |
523 | | - int index = 1; |
524 | | - int count = 1; |
525 | | - |
526 | | - if (std::regex_match(prefix, m, re_split)) { |
527 | | - index = std::stoi(m[2].str()); |
528 | | - count = std::stoi(m[3].str()); |
529 | | - prefix = m[1].str(); |
530 | | - } |
531 | | - |
532 | | - return {std::move(prefix), index, count}; |
533 | | -} |
534 | | - |
535 | | -static std::pair<std::string, std::string> parse_manifest_name(std::string & filename) { |
536 | | - static const std::regex re(R"(^manifest=([^=]+)=([^=]+)=.*\.json$)"); |
537 | | - std::smatch match; |
538 | | - if (std::regex_match(filename, match, re)) { |
539 | | - return {match[1].str(), match[2].str()}; |
540 | | - } |
541 | | - return {}; |
542 | | -} |
543 | | - |
544 | | -static std::string make_old_cache_filename(const std::string & owner, |
545 | | - const std::string & repo, |
546 | | - const std::string & filename) { |
547 | | - auto result = owner + "_" + repo + "_" + filename; |
548 | | - string_replace_all(result, "/", "_"); |
549 | | - return result; |
550 | | -} |
551 | | - |
552 | | -struct migrate_file { |
553 | | - std::string path; |
554 | | - std::string sha256; |
555 | | - size_t size; |
556 | | - fs::path old_path; |
557 | | - fs::path etag_path; |
558 | | - const hf_file * file; |
559 | | -}; |
560 | | - |
561 | | -using migrate_files = std::vector<migrate_file>; |
562 | | - |
563 | | -static bool collect_file(const fs::path & old_cache, |
564 | | - const std::string & owner, |
565 | | - const std::string & repo, |
566 | | - const std::string & path, |
567 | | - const std::string & sha256, |
568 | | - const hf_files & files, |
569 | | - migrate_files & to_migrate) { |
570 | | - |
571 | | - const hf_file * file = nullptr; |
572 | | - |
573 | | - for (const auto & f : files) { |
574 | | - if (f.path == path) { |
575 | | - file = &f; |
576 | | - break; |
577 | | - } |
578 | | - } |
579 | | - |
580 | | - std::string old_filename = make_old_cache_filename(owner, repo, path); |
581 | | - fs::path old_path = old_cache / old_filename; |
582 | | - fs::path etag_path = old_path.string() + ".etag"; |
583 | | - |
584 | | - if (!fs::exists(old_path)) { |
585 | | - if (file && fs::exists(file->final_path)) { |
586 | | - return true; |
587 | | - } |
588 | | - LOG_WRN("%s: %s not found in old cache or HF cache\n", __func__, old_filename.c_str()); |
589 | | - return false; |
590 | | - } |
591 | | - |
592 | | - if (!file) { |
593 | | - LOG_WRN("%s: %s not found in current repo\n", __func__, old_filename.c_str()); |
594 | | - return false; |
595 | | - } |
596 | | - |
597 | | - if (!sha256.empty() && !file->oid.empty() && sha256 != file->oid) { |
598 | | - LOG_WRN("%s: %s is not up to date (sha256 mismatch)\n", __func__, old_filename.c_str()); |
599 | | - return false; |
600 | | - } |
601 | | - |
602 | | - if (file->size > 0) { |
603 | | - size_t size = fs::file_size(old_path); |
604 | | - if (size != file->size) { |
605 | | - LOG_WRN("%s: %s has wrong size %zu (expected %zu)\n", __func__, old_filename.c_str(), size, file->size); |
606 | | - return false; |
607 | | - } |
608 | | - } |
609 | | - |
610 | | - to_migrate.push_back({path, sha256, file->size, old_path, etag_path, file}); |
611 | | - return true; |
612 | | -} |
613 | | - |
614 | | -static bool collect_files(const fs::path & old_cache, |
615 | | - const std::string & owner, |
616 | | - const std::string & repo, |
617 | | - const nl::json & node, |
618 | | - const hf_files & files, |
619 | | - migrate_files & to_migrate) { |
620 | | - |
621 | | - if (!node.contains("rfilename") || |
622 | | - !node.contains("lfs") || |
623 | | - !node["lfs"].contains("sha256")) { |
624 | | - return true; |
625 | | - } |
626 | | - |
627 | | - std::string path = node["rfilename"]; |
628 | | - std::string sha256 = node["lfs"]["sha256"]; |
629 | | - |
630 | | - auto split = get_gguf_split_info(path); |
631 | | - |
632 | | - if (split.count <= 1) { |
633 | | - return collect_file(old_cache, owner, repo, path, sha256, files, to_migrate); |
634 | | - } |
635 | | - |
636 | | - std::vector<std::pair<std::string, std::string>> splits; |
637 | | - |
638 | | - for (const auto & f : files) { |
639 | | - auto split_f = get_gguf_split_info(f.path); |
640 | | - if (split_f.count == split.count && split_f.prefix == split.prefix) { |
641 | | - // sadly the manifest only provides the sha256 of the first file (index == 1) |
642 | | - // the rest will be verified using the size... |
643 | | - std::string f_sha256 = (split_f.index == 1) ? sha256 : ""; |
644 | | - splits.emplace_back(f.path, f_sha256); |
645 | | - } |
646 | | - } |
647 | | - |
648 | | - if ((int)splits.size() != split.count) { |
649 | | - LOG_WRN("%s: expected %d split files but found %d in repo\n", __func__, split.count, (int)splits.size()); |
650 | | - return false; |
651 | | - } |
652 | | - |
653 | | - for (const auto & [f_path, f_sha256] : splits) { |
654 | | - if (!collect_file(old_cache, owner, repo, f_path, f_sha256, files, to_migrate)) { |
655 | | - return false; |
656 | | - } |
657 | | - } |
658 | | - |
659 | | - return true; |
660 | | -} |
661 | | - |
662 | | -static bool migrate_file(const migrate_file & file) { |
663 | | - std::error_code ec; |
664 | | - |
665 | | - fs::path new_path(file.file->local_path); |
666 | | - fs::create_directories(new_path.parent_path(), ec); |
667 | | - |
668 | | - if (!fs::exists(new_path, ec)) { |
669 | | - fs::rename(file.old_path, new_path, ec); |
670 | | - if (ec) { |
671 | | - fs::copy_file(file.old_path, new_path, ec); |
672 | | - if (ec) { |
673 | | - LOG_ERR("%s: failed to move/copy %s: %s\n", __func__, file.old_path.string().c_str(), ec.message().c_str()); |
674 | | - return false; |
675 | | - } |
676 | | - } |
677 | | - fs::remove(file.old_path, ec); |
678 | | - } |
679 | | - fs::remove(file.etag_path, ec); |
680 | | - |
681 | | - std::string filename = finalize_file(*file.file); |
682 | | - LOG_INF("%s: migrated %s -> %s\n", __func__, file.old_path.filename().string().c_str(), filename.c_str()); |
683 | | - return true; |
684 | | -} |
685 | | - |
686 | | -void migrate_old_cache_to_hf_cache(const std::string & token, bool offline) { |
687 | | - fs::path old_cache = fs_get_cache_directory(); |
688 | | - if (!fs::exists(old_cache)) { |
689 | | - return; |
690 | | - } |
691 | | - |
692 | | - if (offline) { |
693 | | - LOG_WRN("%s: skipping migration in offline mode (will run when online)\n", __func__); |
694 | | - return; // -hf is not going to work |
695 | | - } |
696 | | - |
697 | | - bool warned = false; |
698 | | - |
699 | | - for (const auto & entry : fs::directory_iterator(old_cache)) { |
700 | | - if (!entry.is_regular_file()) { |
701 | | - continue; |
702 | | - } |
703 | | - auto filename = entry.path().filename().string(); |
704 | | - auto [owner, repo] = parse_manifest_name(filename); |
705 | | - |
706 | | - if (owner.empty() || repo.empty()) { |
707 | | - continue; |
708 | | - } |
709 | | - |
710 | | - if (!warned) { |
711 | | - warned = true; |
712 | | - LOG_WRN("================================================================================\n" |
713 | | - "WARNING: Migrating cache to HuggingFace cache directory\n" |
714 | | - " Old cache: %s\n" |
715 | | - " New cache: %s\n" |
716 | | - "This one-time migration moves models previously downloaded with -hf\n" |
717 | | - "from the legacy llama.cpp cache to the standard HuggingFace cache.\n" |
718 | | - "Models downloaded with --model-url are not affected.\n" |
719 | | - "================================================================================\n", |
720 | | - old_cache.string().c_str(), get_cache_directory().string().c_str()); |
721 | | - } |
722 | | - |
723 | | - auto repo_id = owner + "/" + repo; |
724 | | - auto files = get_repo_files(repo_id, token); |
725 | | - |
726 | | - if (files.empty()) { |
727 | | - LOG_WRN("%s: could not get repo files for %s, skipping\n", __func__, repo_id.c_str()); |
728 | | - continue; |
729 | | - } |
730 | | - |
731 | | - migrate_files to_migrate; |
732 | | - bool ok = true; |
733 | | - |
734 | | - try { |
735 | | - std::ifstream manifest(entry.path()); |
736 | | - auto json = nl::json::parse(manifest); |
737 | | - for (const char * key : {"ggufFile", "mmprojFile"}) { |
738 | | - if (json.contains(key)) { |
739 | | - if (!collect_files(old_cache, owner, repo, json[key], files, to_migrate)) { |
740 | | - ok = false; |
741 | | - break; |
742 | | - } |
743 | | - } |
744 | | - } |
745 | | - } catch (const std::exception & e) { |
746 | | - LOG_WRN("%s: failed to parse manifest %s: %s\n", __func__, filename.c_str(), e.what()); |
747 | | - continue; |
748 | | - } |
749 | | - |
750 | | - if (!ok) { |
751 | | - LOG_WRN("%s: migration skipped: one or more files failed validation\n", __func__); |
752 | | - continue; |
753 | | - } |
754 | | - |
755 | | - for (const auto & file : to_migrate) { |
756 | | - if (!migrate_file(file)) { |
757 | | - ok = false; |
758 | | - break; |
759 | | - } |
760 | | - } |
761 | | - |
762 | | - if (!ok) { |
763 | | - LOG_WRN("%s: migration failed: could not migrate all files\n", __func__); |
764 | | - continue; |
765 | | - } |
766 | | - |
767 | | - LOG_INF("%s: migration complete, deleting manifest: %s\n", __func__, entry.path().string().c_str()); |
768 | | - fs::remove(entry.path()); |
769 | | - } |
770 | | -} |
771 | | - |
772 | 498 | } // namespace hf_cache |
0 commit comments