LocalAI - docker-compose fail - [llama-cpp] Fails: could not load model: rpc error: code = Unavailable desc = error reading from server: EOF
Compose File:
version: '3.8'
services:
  localai:
    image: localai/localai:latest-aio-gpu-hipblas
    deploy:
      resources:
        limits:
          cpus: 8.0
          memory: 32G
    environment:
      - DEBUG=true
      - REBUILD=true
      - BUILD_TYPE=hipblas
      - BUILD_GRPC_FOR_BACKEND_LLAMA=ON
      - GPU_TARGETS=gfx1100
      - HSA_OVERRIDE_GFX_VERSION=10.3.0
      - CMAKE_BUILD_PARALLEL_LEVEL=16
    ports:
      - "8080:8080"
    volumes:
      - ./models:/models
    devices:
      - /dev/dri
      - /dev/kfd
    security_opt:
      - seccomp:unconfined
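
One detail worth flagging in this config: the Radeon RX 7900 XT is a gfx1100 (RDNA3) part, but HSA_OVERRIDE_GFX_VERSION=10.3.0 tells the ROCm runtime to treat every GPU as gfx1030 (RDNA2); the log below accordingly reports "compute capability 10.3" for both devices. Running gfx1030 code objects on an RDNA3 card is a plausible culprit for the llama-cpp backend dying mid-load. A minimal change to try, assuming the 7900 XT is the intended device:

    environment:
      - GPU_TARGETS=gfx1100
      - HSA_OVERRIDE_GFX_VERSION=11.0.0  # match RDNA3; 10.3.0 targets gfx1030/RDNA2
      # - HIP_VISIBLE_DEVICES=0          # optionally hide the iGPU (Device 1 in the log)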
Log Output:
localai-1 | 1:37PM INF Trying to load the model '5c7cd056ecf9a4bb5b527410b97f48cb' with all the available backends: llama-cpp, llama-ggml, gpt4all, bert-embeddings, rwkv, whisper, stablediffusion, tinydream, piper, /build/backend/python/coqui/run.sh, /build/backend/python/autogptq/run.sh, /build/backend/python/vall-e-x/run.sh, /build/backend/python/transformers/run.sh, /build/backend/python/bark/run.sh, /build/backend/python/diffusers/run.sh, /build/backend/python/exllama2/run.sh, /build/backend/python/exllama/run.sh, /build/backend/python/transformers-musicgen/run.sh, /build/backend/python/sentencetransformers/run.sh, /build/backend/python/sentencetransformers/run.sh, /build/backend/python/vllm/run.sh, /build/backend/python/mamba/run.sh, /build/backend/python/petals/run.sh
localai-1 | 1:37PM INF [llama-cpp] Attempting to load
localai-1 | 1:37PM INF Loading model '5c7cd056ecf9a4bb5b527410b97f48cb' with backend llama-cpp
localai-1 | 1:37PM DBG Loading model in memory from file: /build/models/5c7cd056ecf9a4bb5b527410b97f48cb
localai-1 | 1:37PM DBG Loading Model 5c7cd056ecf9a4bb5b527410b97f48cb with gRPC (file: /build/models/5c7cd056ecf9a4bb5b527410b97f48cb) (backend: llama-cpp): {backendString:llama-cpp model:5c7cd056ecf9a4bb5b527410b97f48cb threads:4 assetDir:/tmp/localai/backend_data context:{emptyCtx:{}} gRPCOptions:0xc000203400 externalBackends:map[autogptq:/build/backend/python/autogptq/run.sh bark:/build/backend/python/bark/run.sh coqui:/build/backend/python/coqui/run.sh diffusers:/build/backend/python/diffusers/run.sh exllama:/build/backend/python/exllama/run.sh exllama2:/build/backend/python/exllama2/run.sh huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh mamba:/build/backend/python/mamba/run.sh petals:/build/backend/python/petals/run.sh sentencetransformers:/build/backend/python/sentencetransformers/run.sh transformers:/build/backend/python/transformers/run.sh transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh vall-e-x:/build/backend/python/vall-e-x/run.sh vllm:/build/backend/python/vllm/run.sh] grpcAttempts:20 grpcAttemptsDelay:2 singleActiveBackend:false parallelRequests:false}
localai-1 | 1:37PM DBG Loading GRPC Process: /tmp/localai/backend_data/backend-assets/grpc/llama-cpp
localai-1 | 1:37PM DBG GRPC Service for 5c7cd056ecf9a4bb5b527410b97f48cb will be running at: '127.0.0.1:40971'
localai-1 | 1:37PM DBG GRPC Service state dir: /tmp/go-processmanager434371092
localai-1 | 1:37PM DBG GRPC Service Started
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stdout Server listening on 127.0.0.1:40971
localai-1 | 1:37PM DBG GRPC Service Ready
localai-1 | 1:37PM DBG GRPC: Loading model with options: {state:{NoUnkeyedLiterals:{} DoNotCompare:[] DoNotCopy:[] atomicMessageInfo:<nil>} sizeCache:0 unknownFields:[] Model:5c7cd056ecf9a4bb5b527410b97f48cb ContextSize:4096 Seed:774869951 NBatch:512 F16Memory:true MLock:false MMap:true VocabOnly:false LowVRAM:false Embeddings:false NUMA:false NGPULayers:99999999 MainGPU: TensorSplit: Threads:4 LibrarySearchPath: RopeFreqBase:0 RopeFreqScale:0 RMSNormEps:0 NGQA:0 ModelFile:/build/models/5c7cd056ecf9a4bb5b527410b97f48cb Device: UseTriton:false ModelBaseName: UseFastTokenizer:false PipelineType: SchedulerType: CUDA:false CFGScale:0 IMG2IMG:false CLIPModel: CLIPSubfolder: CLIPSkip:0 ControlNet: Tokenizer: LoraBase: LoraAdapter: LoraScale:0 NoMulMatQ:false DraftModel: AudioPath: Quantization: GPUMemoryUtilization:0 TrustRemoteCode:false EnforceEager:false SwapSpace:0 MaxModelLen:0 MMProj: RopeScaling: YarnExtFactor:0 YarnAttnFactor:0 YarnBetaFast:0 YarnBetaSlow:0 Type:}
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /build/models/5c7cd056ecf9a4bb5b527410b97f48cb (version GGUF V3 (latest))
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llama_model_loader: - kv 0: general.architecture str = llama
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llama_model_loader: - kv 1: general.name str = jeffq
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llama_model_loader: - kv 2: llama.context_length u32 = 32768
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llama_model_loader: - kv 3: llama.embedding_length u32 = 4096
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llama_model_loader: - kv 4: llama.block_count u32 = 32
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llama_model_loader: - kv 5: llama.feed_forward_length u32 = 14336
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llama_model_loader: - kv 6: llama.rope.dimension_count u32 = 128
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llama_model_loader: - kv 7: llama.attention.head_count u32 = 32
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llama_model_loader: - kv 8: llama.attention.head_count_kv u32 = 8
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llama_model_loader: - kv 10: llama.rope.freq_base f32 = 10000.000000
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llama_model_loader: - kv 11: general.file_type u32 = 18
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llama_model_loader: - kv 12: tokenizer.ggml.model str = llama
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llama_model_loader: - kv 13: tokenizer.ggml.tokens arr[str,32032] = ["<unk>", "<s>", "</s>", "<0x00>", "<...
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llama_model_loader: - kv 14: tokenizer.ggml.scores arr[f32,32032] = [0.000000, 0.000000, 0.000000, 0.0000...
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llama_model_loader: - kv 15: tokenizer.ggml.token_type arr[i32,32032] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llama_model_loader: - kv 16: tokenizer.ggml.bos_token_id u32 = 1
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llama_model_loader: - kv 17: tokenizer.ggml.eos_token_id u32 = 32000
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llama_model_loader: - kv 18: tokenizer.ggml.add_bos_token bool = true
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llama_model_loader: - kv 19: tokenizer.ggml.add_eos_token bool = false
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llama_model_loader: - kv 20: tokenizer.chat_template str = {% for message in messages %}{{'<|im_...
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llama_model_loader: - kv 21: general.quantization_version u32 = 2
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llama_model_loader: - type f32: 65 tensors
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llama_model_loader: - type q6_K: 226 tensors
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_vocab: special tokens definition check successful ( 291/32032 ).
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_print_meta: format = GGUF V3 (latest)
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_print_meta: arch = llama
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_print_meta: vocab type = SPM
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_print_meta: n_vocab = 32032
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_print_meta: n_merges = 0
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_print_meta: n_ctx_train = 32768
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_print_meta: n_embd = 4096
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_print_meta: n_head = 32
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_print_meta: n_head_kv = 8
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_print_meta: n_layer = 32
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_print_meta: n_rot = 128
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_print_meta: n_embd_head_k = 128
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_print_meta: n_embd_head_v = 128
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_print_meta: n_gqa = 4
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_print_meta: n_embd_k_gqa = 1024
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_print_meta: n_embd_v_gqa = 1024
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_print_meta: f_norm_eps = 0.0e+00
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_print_meta: f_norm_rms_eps = 1.0e-05
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_print_meta: f_clamp_kqv = 0.0e+00
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_print_meta: f_max_alibi_bias = 0.0e+00
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_print_meta: f_logit_scale = 0.0e+00
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_print_meta: n_ff = 14336
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_print_meta: n_expert = 0
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_print_meta: n_expert_used = 0
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_print_meta: causal attn = 1
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_print_meta: pooling type = 0
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_print_meta: rope type = 0
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_print_meta: rope scaling = linear
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_print_meta: freq_base_train = 10000.0
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_print_meta: freq_scale_train = 1
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_print_meta: n_yarn_orig_ctx = 32768
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_print_meta: rope_finetuned = unknown
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_print_meta: ssm_d_conv = 0
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_print_meta: ssm_d_inner = 0
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_print_meta: ssm_d_state = 0
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_print_meta: ssm_dt_rank = 0
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_print_meta: model type = 7B
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_print_meta: model ftype = Q6_K
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_print_meta: model params = 7.24 B
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_print_meta: model size = 5.53 GiB (6.56 BPW)
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_print_meta: general.name = jeffq
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_print_meta: BOS token = 1 '<s>'
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_print_meta: EOS token = 32000 '<|im_end|>'
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_print_meta: UNK token = 0 '<unk>'
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_print_meta: LF token = 13 '<0x0A>'
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr ggml_cuda_init: CUDA_USE_TENSOR_CORES: yes
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr ggml_cuda_init: found 2 ROCm devices:
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr   Device 0: Radeon RX 7900 XT, compute capability 10.3, VMM: no
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr   Device 1: AMD Radeon Graphics, compute capability 10.3, VMM: no
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_tensors: ggml ctx size = 0.22 MiB
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_tensors: offloading 32 repeating layers to GPU
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_tensors: offloading non-repeating layers to GPU
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_tensors: offloaded 33/33 layers to GPU
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_tensors: ROCm0 buffer size = 5563.66 MiB
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:40971): stderr llm_load_tensors: CPU buffer size = 102.64 MiB
localai-1 | 1:37PM INF [llama-cpp] Fails: could not load model: rpc error: code = Unavailable desc = error reading from server: EOF
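
The EOF above means the llama-cpp gRPC child process exited while the model was being brought up: the weights had already been offloaded (ROCm0 buffer ~5.5 GiB), so the crash most likely happened during GPU kernel initialization. With an RDNA3 card overridden to gfx1030 (see the compose note above), a GPU fault is the usual suspect; the host kernel log is worth checking around the time of the crash:

    sudo dmesg | grep -i amdgpu   # look for page faults or queue/ring errors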
localai-1 | 1:37PM INF [llama-ggml] Attempting to load
localai-1 | 1:37PM INF Loading model '5c7cd056ecf9a4bb5b527410b97f48cb' with backend llama-ggml
localai-1 | 1:37PM DBG Loading model in memory from file: /build/models/5c7cd056ecf9a4bb5b527410b97f48cb
localai-1 | 1:37PM DBG Loading Model 5c7cd056ecf9a4bb5b527410b97f48cb with gRPC (file: /build/models/5c7cd056ecf9a4bb5b527410b97f48cb) (backend: llama-ggml): {backendString:llama-ggml model:5c7cd056ecf9a4bb5b527410b97f48cb threads:4 assetDir:/tmp/localai/backend_data context:{emptyCtx:{}} gRPCOptions:0xc000203400 externalBackends:map[autogptq:/build/backend/python/autogptq/run.sh bark:/build/backend/python/bark/run.sh coqui:/build/backend/python/coqui/run.sh diffusers:/build/backend/python/diffusers/run.sh exllama:/build/backend/python/exllama/run.sh exllama2:/build/backend/python/exllama2/run.sh huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh mamba:/build/backend/python/mamba/run.sh petals:/build/backend/python/petals/run.sh sentencetransformers:/build/backend/python/sentencetransformers/run.sh transformers:/build/backend/python/transformers/run.sh transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh vall-e-x:/build/backend/python/vall-e-x/run.sh vllm:/build/backend/python/vllm/run.sh] grpcAttempts:20 grpcAttemptsDelay:2 singleActiveBackend:false parallelRequests:false}
localai-1 | 1:37PM DBG Loading GRPC Process: /tmp/localai/backend_data/backend-assets/grpc/llama-ggml
localai-1 | 1:37PM DBG GRPC Service for 5c7cd056ecf9a4bb5b527410b97f48cb will be running at: '127.0.0.1:38217'
localai-1 | 1:37PM DBG GRPC Service state dir: /tmp/go-processmanager4259228381
localai-1 | 1:37PM DBG GRPC Service Started
localai-1 | 1:37PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:38217): stderr 2024/04/24 13:37:59 gRPC Server listening at 127.0.0.1:38217
localai-1 | [127.0.0.1]:50280 200 - GET /readyz
localai-1 | 1:38PM DBG GRPC Service Ready
localai-1 | 1:38PM DBG GRPC: Loading model with options: {state:{NoUnkeyedLiterals:{} DoNotCompare:[] DoNotCopy:[] atomicMessageInfo:<nil>} sizeCache:0 unknownFields:[] Model:5c7cd056ecf9a4bb5b527410b97f48cb ContextSize:4096 Seed:774869951 NBatch:512 F16Memory:true MLock:false MMap:true VocabOnly:false LowVRAM:false Embeddings:false NUMA:false NGPULayers:99999999 MainGPU: TensorSplit: Threads:4 LibrarySearchPath: RopeFreqBase:0 RopeFreqScale:0 RMSNormEps:0 NGQA:0 ModelFile:/build/models/5c7cd056ecf9a4bb5b527410b97f48cb Device: UseTriton:false ModelBaseName: UseFastTokenizer:false PipelineType: SchedulerType: CUDA:false CFGScale:0 IMG2IMG:false CLIPModel: CLIPSubfolder: CLIPSkip:0 ControlNet: Tokenizer: LoraBase: LoraAdapter: LoraScale:0 NoMulMatQ:false DraftModel: AudioPath: Quantization: GPUMemoryUtilization:0 TrustRemoteCode:false EnforceEager:false SwapSpace:0 MaxModelLen:0 MMProj: RopeScaling: YarnExtFactor:0 YarnAttnFactor:0 YarnBetaFast:0 YarnBetaSlow:0 Type:}
localai-1 | 1:38PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:38217): stderr create_gpt_params: loading model /build/models/5c7cd056ecf9a4bb5b527410b97f48cb
localai-1 | 1:38PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:38217): stderr llama.cpp: loading model from /build/models/5c7cd056ecf9a4bb5b527410b97f48cb
localai-1 | 1:38PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:38217): stderr error loading model: unknown (magic, version) combination: 46554747, 00000003; is this really a GGML file?
localai-1 | 1:38PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:38217): stderr llama_load_model_from_file: failed to load model
localai-1 | 1:38PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:38217): stderr llama_init_from_gpt_params: error: failed to load model '/build/models/5c7cd056ecf9a4bb5b527410b97f48cb'
localai-1 | 1:38PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:38217): stderr load_binding_model: error: unable to load model
localai-1 | 1:38PM INF [llama-ggml] Fails: could not load model: rpc error: code = Unknown desc = failed loading model
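
This llama-ggml failure is expected rather than a second bug: "unknown (magic, version) combination: 46554747, 00000003" is the legacy GGML loader refusing a GGUF file, since 0x46554747 is the 4-byte magic "GGUF" read as a little-endian uint32 and 3 is the GGUF version. The header is easy to confirm on the host (path per the volume mount above):

    xxd -l 8 ./models/5c7cd056ecf9a4bb5b527410b97f48cb
    # expected: 4747 5546 0300 0000  GGUF....  ("GGUF" magic + version 3)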
localai-1 | 1:38PM INF [gpt4all] Attempting to load
localai-1 | 1:38PM INF Loading model '5c7cd056ecf9a4bb5b527410b97f48cb' with backend gpt4all
localai-1 | 1:38PM DBG Loading model in memory from file: /build/models/5c7cd056ecf9a4bb5b527410b97f48cb
localai-1 | 1:38PM DBG Loading Model 5c7cd056ecf9a4bb5b527410b97f48cb with gRPC (file: /build/models/5c7cd056ecf9a4bb5b527410b97f48cb) (backend: gpt4all): {backendString:gpt4all model:5c7cd056ecf9a4bb5b527410b97f48cb threads:4 assetDir:/tmp/localai/backend_data context:{emptyCtx:{}} gRPCOptions:0xc000203400 externalBackends:map[autogptq:/build/backend/python/autogptq/run.sh bark:/build/backend/python/bark/run.sh coqui:/build/backend/python/coqui/run.sh diffusers:/build/backend/python/diffusers/run.sh exllama:/build/backend/python/exllama/run.sh exllama2:/build/backend/python/exllama2/run.sh huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh mamba:/build/backend/python/mamba/run.sh petals:/build/backend/python/petals/run.sh sentencetransformers:/build/backend/python/sentencetransformers/run.sh transformers:/build/backend/python/transformers/run.sh transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh vall-e-x:/build/backend/python/vall-e-x/run.sh vllm:/build/backend/python/vllm/run.sh] grpcAttempts:20 grpcAttemptsDelay:2 singleActiveBackend:false parallelRequests:false}
localai-1 | 1:38PM DBG Loading GRPC Process: /tmp/localai/backend_data/backend-assets/grpc/gpt4all
localai-1 | 1:38PM DBG GRPC Service for 5c7cd056ecf9a4bb5b527410b97f48cb will be running at: '127.0.0.1:39303'
localai-1 | 1:38PM DBG GRPC Service state dir: /tmp/go-processmanager3533884233
localai-1 | 1:38PM DBG GRPC Service Started
localai-1 | 1:38PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:39303): stderr 2024/04/24 13:38:01 gRPC Server listening at 127.0.0.1:39303
localai-1 | 1:38PM DBG GRPC Service Ready
localai-1 | 1:38PM DBG GRPC: Loading model with options: {state:{NoUnkeyedLiterals:{} DoNotCompare:[] DoNotCopy:[] atomicMessageInfo:<nil>} sizeCache:0 unknownFields:[] Model:5c7cd056ecf9a4bb5b527410b97f48cb ContextSize:4096 Seed:774869951 NBatch:512 F16Memory:true MLock:false MMap:true VocabOnly:false LowVRAM:false Embeddings:false NUMA:false NGPULayers:99999999 MainGPU: TensorSplit: Threads:4 LibrarySearchPath:/tmp/localai/backend_data/backend-assets/gpt4all RopeFreqBase:0 RopeFreqScale:0 RMSNormEps:0 NGQA:0 ModelFile:/build/models/5c7cd056ecf9a4bb5b527410b97f48cb Device: UseTriton:false ModelBaseName: UseFastTokenizer:false PipelineType: SchedulerType: CUDA:false CFGScale:0 IMG2IMG:false CLIPModel: CLIPSubfolder: CLIPSkip:0 ControlNet: Tokenizer: LoraBase: LoraAdapter: LoraScale:0 NoMulMatQ:false DraftModel: AudioPath: Quantization: GPUMemoryUtilization:0 TrustRemoteCode:false EnforceEager:false SwapSpace:0 MaxModelLen:0 MMProj: RopeScaling: YarnExtFactor:0 YarnAttnFactor:0 YarnBetaFast:0 YarnBetaSlow:0 Type:}
localai-1 | 1:38PM DBG GRPC(5c7cd056ecf9a4bb5b527410b97f48cb-127.0.0.1:39303): stderr load_model: error 'Model format not supported (no matching implementation found)'
localai-1 | 1:38PM INF [gpt4all] Fails: could not load model: rpc error: code = Unknown desc = failed loading model
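
Taken together, the three attempts are consistent: llama-ggml and gpt4all simply do not support this GGUF v3 file, so their failures are fallback noise, and llama-cpp, the only backend that can load it, is the one that crashes. While debugging, pinning the backend in a model config file keeps the log focused on the relevant backend; the file name and alias below are illustrative, not taken from this gist:

    # models/gpt-4.yaml (hypothetical)
    name: gpt-4
    backend: llama-cpp
    parameters:
      model: 5c7cd056ecf9a4bb5b527410b97f48cb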
Test Command:
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{ "model": "gpt-4", "temperature": 0.1, "messages": [{"role": "user", "content": "How are you doing?"}] }'
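
If the request errors out with an unknown model instead of reaching the loader, listing what the server actually exposes confirms whether the "gpt-4" alias is wired up:

    curl http://localhost:8080/v1/models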