diff --git a/main.py b/main.py
index 04f3e1f..f8b7900 100644
--- a/main.py
+++ b/main.py
@@ -13,6 +13,7 @@
 from routers.folder import router as folder_router
 from routers.checklist import router as checklist_router
 from routers.file import router as file_router
+from routers import qg_router
 
 
 # 1) 환경변수 로드 (상단에서 선 로드됨)
@@ -42,6 +43,7 @@
 app.include_router(folder_router)
 app.include_router(file_router)
 app.include_router(checklist_router)
+app.include_router(qg_router.router)
 
 @app.get("/")
 def root():
diff --git a/qg_answers.json b/qg_answers.json
new file mode 100644
index 0000000..92ff0af
--- /dev/null
+++ b/qg_answers.json
@@ -0,0 +1,29 @@
+{
+    "answers": [
+        {
+            "id": 1,
+            "question_preview": "다음 중 인공지능(AI)의 정의로 가장 적절한 것은 무엇인가요?...",
+            "correct_answer": "인간의 지능적 행동을 기계가 수행할 수 있도록 하는 기술"
+        },
+        {
+            "id": 2,
+            "question_preview": "지도학습(Supervised Learning)에서 학습에 사용되는 데이터는 어떤 특징을 가...",
+            "correct_answer": "입력과 정답이 함께 주어진 데이터를 이용하여 학습"
+        },
+        {
+            "id": 3,
+            "question_preview": "강화학습(Reinforcement Learning)에서 학습하는 주요 목표는 무엇인가요?...",
+            "correct_answer": "환경과 상호작용하며 보상을 최대화하는 행동 학습"
+        },
+        {
+            "id": 4,
+            "question_preview": "딥러닝(DL)의 핵심 기술은 무엇인가요?...",
+            "correct_answer": "인공신경망을 여러 층으로 깊게 쌓아 복잡한 패턴을 학습하는 기술"
+        },
+        {
+            "id": 5,
+            "question_preview": "대규모 언어모델(LLM)은 어떤 특징을 가지고 있나요?...",
+            "correct_answer": "수십억 개 이상의 파라미터를 가진 언어모델로, 다양한 언어 태스크에서 활용"
+        }
+    ]
+}
\ No newline at end of file
diff --git a/qg_questions.json b/qg_questions.json
new file mode 100644
index 0000000..51e046a
--- /dev/null
+++ b/qg_questions.json
@@ -0,0 +1,54 @@
+{
+    "questions": [
+        {
+            "id": 1,
+            "question": "다음 중 인공지능(AI)의 정의로 가장 적절한 것은 무엇인가요?",
+            "options": [
+                "데이터를 저장하고 관리하는 기술",
+                "인간의 지능적 행동을 기계가 수행할 수 있도록 하는 기술",
+                "컴퓨터가 물리적인 작업을 자동으로 수행하는 방법",
+                "네트워크를 통해 정보를 전송하는 방식"
+            ]
+        },
+        {
+            "id": 2,
+            "question": "지도학습(Supervised Learning)에서 학습에 사용되는 데이터는 어떤 특징을 가지나요?",
+            "options": [
+                "정답 없이 데이터의 구조를 찾아내는 학습",
+                "입력과 정답이 함께 주어진 데이터를 이용하여 학습",
+                "환경과 상호작용하며 보상을 최대화하는 학습",
+                "복잡한 패턴을 학습하기 위해 여러 층의 신경망을 사용하는 학습"
+            ]
+        },
+        {
+            "id": 3,
+            "question": "강화학습(Reinforcement Learning)에서 학습하는 주요 목표는 무엇인가요?",
+            "options": [
+                "데이터를 분석하여 패턴을 발견하는 것",
+                "환경과 상호작용하며 보상을 최대화하는 행동 학습",
+                "객체를 인식하고 분류하는 것",
+                "자연어를 이해하고 생성하는 것"
+            ]
+        },
+        {
+            "id": 4,
+            "question": "딥러닝(DL)의 핵심 기술은 무엇인가요?",
+            "options": [
+                "자연어를 이해하고 생성하는 기술",
+                "이미지나 영상 데이터를 분석하는 기술",
+                "생성자와 판별자가 경쟁하며 데이터와 유사한 샘플을 생성하는 모델",
+                "인공신경망을 여러 층으로 깊게 쌓아 복잡한 패턴을 학습하는 기술"
+            ]
+        },
+        {
+            "id": 5,
+            "question": "대규모 언어모델(LLM)은 어떤 특징을 가지고 있나요?",
+            "options": [
+                "대규모 데이터로 일반적 표현을 학습한 후 특정 태스크에 맞게 미세조정",
+                "입력과 정답이 함께 주어진 데이터를 이용하여 학습",
+                "수십억 개 이상의 파라미터를 가진 언어모델로, 다양한 언어 태스크에서 활용",
+                "생성자와 판별자가 경쟁하며 데이터와 유사한 샘플을 생성하는 모델"
+            ]
+        }
+    ]
+}
\ No newline at end of file
diff --git a/routers/qg_router.py b/routers/qg_router.py
new file mode 100644
index 0000000..939a659
--- /dev/null
+++ b/routers/qg_router.py
@@ -0,0 +1,47 @@
+from fastapi import APIRouter, HTTPException
+from pydantic import BaseModel, Field
+from typing import List
+import asyncio
+
+# Import the question-generation service function defined in utils/qg_service.py.
+from utils.qg_service import generate_questions_from_text
+
+# Import the request/response models defined in the schemas package.
+from schemas import QuestionGenerationRequest, QuestionGenerationResponse, QuestionItem
+
+router = APIRouter(prefix="/qg", tags=["Question Generation"])
+
+@router.post(
+    "/generate",
+    response_model=QuestionGenerationResponse,
+    summary="요약 텍스트를 기반으로 예상 문제(객관식)를 생성합니다."
+)
+async def generate_questions_endpoint(request: QuestionGenerationRequest):
+    """
+    Generate practice questions from the supplied text (typically an LLM summary).
+
+    Responds with 400 when the text is empty or shorter than 100 characters after
+    stripping, and with 500 when the generation service raises any exception.
+    """
+    source_text = request.text
+    long_enough = bool(source_text) and len(source_text.strip()) >= 100
+    if not long_enough:
+        raise HTTPException(
+            status_code=400,
+            detail="문제 생성을 위한 텍스트 내용이 너무 짧거나 비어 있습니다. (최소 100자 이상 필요)"
+        )
+
+    try:
+        # Await the question-generation service with the request's parameters.
+        service_args = dict(
+            text=request.text,
+            num_questions=request.num_questions,
+            question_type=request.question_type,
+            language=request.language,
+        )
+        generated = await generate_questions_from_text(**service_args)
+        return QuestionGenerationResponse(questions=generated)
+    except Exception as e:
+        # Proper logging should replace this print in a real deployment.
+        print(f"문제 생성 중 오류 발생: {e}")
+        raise HTTPException(status_code=500, detail="문제 생성 서비스 처리 중 서버 오류가 발생했습니다.")
\ No newline at end of file
diff --git a/schemas/__init__.py b/schemas/__init__.py
index e6f8dd0..c4714ad 100644
--- a/schemas/__init__.py
+++ b/schemas/__init__.py
@@ -6,4 +6,10 @@
     GoogleLoginRequest, NaverLoginRequest,
     KakaoLoginRequest,
 )
+
+from .qg_schema import (
+    QuestionItem,
+    QuestionGenerationRequest,
+    QuestionGenerationResponse,
+)
  
\ No newline at end of file
diff --git a/schemas/qg_schema.py b/schemas/qg_schema.py
new file mode 100644
index 0000000..d18f290
--- /dev/null
+++ b/schemas/qg_schema.py
@@ -0,0 +1,26 @@
+from pydantic import BaseModel, Field
+from typing import List, Literal, Optional
+
+# =========================================================
+# Question Generation (QG) schemas
+# =========================================================
+
+class QuestionItem(BaseModel):
+    """Data model for a single generated question."""
+    question: str = Field(..., description="질문 내용")
+    answer: str = Field(..., description="정답 내용")
+    # Intended to hold four choices for multiple-choice items and to stay empty for short-answer items (not enforced here).
+    options: List[str] = Field(default_factory=list, description="질문 보기를 포함하는 리스트") 
+
+class QuestionGenerationRequest(BaseModel):
+    """Input model for a question-generation request."""
+    text: str = Field(..., description="문제를 생성할 기반 텍스트 (LLM 요약 결과 등)")  # required source text
+    num_questions: int = Field(5, description="생성할 문제 개수 (1-10)", ge=1, le=10)  # defaults to 5; validated to 1..10 via ge/le
+    question_type: Literal["multiple_choice", "short_answer"] = Field(
+        "multiple_choice", description="생성할 문제 유형"
+    )
+    language: Literal["ko", "en"] = Field("ko", description="텍스트의 언어 (ko 또는 en)")  # defaults to "ko"
+
+class QuestionGenerationResponse(BaseModel):
+    """Response model carrying the list of generated questions."""
+    questions: List[QuestionItem] = Field(..., description="생성된 문제 리스트")
\ No newline at end of file
diff --git a/utils/qg_service.py b/utils/qg_service.py
new file mode 100644
index 0000000..20ca8f5
--- /dev/null
+++ b/utils/qg_service.py
@@ -0,0 +1,236 @@
+# utils/qg_service.py
+from __future__ import annotations
+import os, json, re, asyncio
+from typing import List, Literal, Optional, Dict, Any
+
+# HF Transformers
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+# Import the QuestionItem model defined in the schemas package.
+from schemas import QuestionItem 
+
+# =========================================================
+# Settings (the default model is Qwen3-4B-Instruct-2507); each value can be overridden via an environment variable
+# =========================================================
+QG_MODEL_NAME = os.getenv("QG_MODEL_NAME", "Qwen/Qwen3-4B-Instruct-2507")  # model id handed to from_pretrained
+QG_MAX_NEW_TOKENS = int(os.getenv("QG_MAX_NEW_TOKENS", "4096"))  # passed to generate() as max_new_tokens
+QG_TEMPERATURE = float(os.getenv("QG_TEMPERATURE", "0.2"))  # sampling temperature passed to generate()
+HF_API_TOKEN = os.getenv("HF_API_TOKEN") or None  # optional hub token; an empty string is treated as unset
+
+_QG_MODEL = None  # process-wide cache, filled lazily by _load_qg_model()
+_QG_TOKENIZER = None  # process-wide cache, filled lazily by _load_qg_model()
+
+# =========================================================
+# Model-loading utilities (multi-GPU sharding with CPU fallback)
+# =========================================================
+def _resolve_dtype():
+    """Pick the load dtype: bf16 if CUDA supports it, fp16 on other CUDA setups, None without CUDA."""
+    if not torch.cuda.is_available():
+        return None
+    if torch.cuda.is_bf16_supported():
+        return torch.bfloat16
+    return torch.float16
+
+
+def _available_gpus():
+    """Return the indices of every visible CUDA device, or an empty list when CUDA is unavailable."""
+    if torch.cuda.is_available():
+        return list(range(torch.cuda.device_count()))
+    return []
+
+
+def _load_qg_model():
+    """Load the QG model and tokenizer once, cache them in module globals, and return ``(model, tokenizer)``; raises RuntimeError on non-OOM failures."""
+    global _QG_MODEL, _QG_TOKENIZER
+    if _QG_MODEL is not None and _QG_TOKENIZER is not None:
+        return _QG_MODEL, _QG_TOKENIZER
+
+    torch_dtype = _resolve_dtype()
+    load_in_4bit = os.getenv("HF_LOAD_IN_4BIT", "false").lower() in ("1", "true", "yes")
+
+    print(f"QG 모델 로딩 중: {QG_MODEL_NAME}, 4bit={load_in_4bit}")
+
+    hub_kwargs = {"trust_remote_code": True}  # NOTE(review): presumably lets repo-supplied code run at load time — confirm it is required
+    if HF_API_TOKEN:
+        hub_kwargs["token"] = HF_API_TOKEN
+
+    try:
+        # 1. Load the tokenizer
+        _QG_TOKENIZER = AutoTokenizer.from_pretrained(QG_MODEL_NAME, **hub_kwargs)
+
+        # 2. Assemble the model-loading keyword arguments
+        kwargs = dict(hub_kwargs)
+        if load_in_4bit:
+            kwargs["load_in_4bit"] = True
+        if torch_dtype is not None:
+            kwargs["torch_dtype"] = torch_dtype
+
+        # 3. Per-device memory budgets used for sharded placement
+        if torch.cuda.is_available():
+            gpu_list = _available_gpus()
+            print(f"사용 가능한 GPU 목록: {gpu_list}")
+
+            # Budget per device: GPU 0 gets a small cap, every other GPU gets a larger one
+            max_memory = {}
+            for i in gpu_list:
+                if i == 0:
+                    max_memory[i] = "4GiB"   # GPU 0 may already be busy with other work
+                else:
+                    max_memory[i] = "20GiB"  # larger budget for the remaining GPUs
+            max_memory["cpu"] = "30GiB"
+
+            kwargs["max_memory"] = max_memory
+            device_map = "auto"
+
+            print(f"GPU 및 CPU 분산 배치 적용: {max_memory}")
+        else:
+            print("⚠️ CUDA 비활성화됨 — CPU에서 로드합니다.")
+            device_map = {"": "cpu"}
+
+        # 4. Load the model with the chosen device map
+        _QG_MODEL = AutoModelForCausalLM.from_pretrained(
+            QG_MODEL_NAME,
+            device_map=device_map,
+            **kwargs
+        )
+
+        print("✅ QG 모델 로드 완료.")
+        return _QG_MODEL, _QG_TOKENIZER
+
+    except torch.cuda.OutOfMemoryError as oom_e:
+        print(f"⚠️ GPU 메모리 부족 — CPU로 fallback합니다: {oom_e}")
+        _QG_MODEL = AutoModelForCausalLM.from_pretrained(
+            QG_MODEL_NAME,
+            device_map={"": "cpu"},
+            **hub_kwargs  # only the hub kwargs are reused, so the dtype / 4-bit / max_memory settings are dropped here
+        )
+        return _QG_MODEL, _QG_TOKENIZER
+
+    except Exception as e:
+        print(f"QG 모델 로딩 실패: {e}")
+        raise RuntimeError(f"QG 모델 로딩 실패: {QG_MODEL_NAME} (오류 내용: {e})") from e
+
+
+# =========================================================
+# Text pre-processing / prompt building
+# =========================================================
+def _extract_key_points(full_text: str) -> str:
+    """Return only the '## 핵심 요점' / KEY POINTS / KEY TAKEAWAYS section, or the whole text when no such heading exists."""
+    section_re = re.compile(
+        r"##\s*(핵심 요점|KEY POINTS|KEY TAKEAWAYS)\s*\n(.*?)(\n##\s*|\Z)",
+        re.DOTALL | re.IGNORECASE,
+    )
+    found = section_re.search(full_text)
+    if found is None:
+        print("⚠️ '핵심 요점' 섹션 추출 실패 — 전체 텍스트 기반으로 문제 생성.")
+        return full_text
+    return "핵심 요점:\n" + found.group(2).strip()
+
+
+def _build_qg_prompt(tokenizer, system_text: str, user_text: str) -> str:
+    """Render a system turn plus a user turn into one prompt string via the tokenizer's chat template."""
+    system_turn = {"role": "system", "content": system_text}
+    user_turn = {"role": "user", "content": user_text}
+    template_options = {"tokenize": False, "add_generation_prompt": True}
+    return tokenizer.apply_chat_template([system_turn, user_turn], **template_options)
+
+
+def _get_system_prompt(num: int, q_type: str, lang: str) -> str:
+    """Build the Korean system prompt; the JSON example carries no '//' comment (invalid JSON the model may echo) and rule 1 follows q_type."""
+    lang_name = "한국어" if lang == "ko" else "English"
+    is_mc = q_type == "multiple_choice"
+    q_type_desc = "객관식 (4지선다, 정답 포함)" if is_mc else "단답형 주관식"
+    options_example = '["[보기1]", "[보기2]", "[보기3]", "[보기4]"]' if is_mc else "[]"  # short-answer items keep options empty
+    options_rule = "1. 정답은 반드시 'options' 리스트 내에 포함되어야 합니다.\n" if is_mc else "1. 'options'는 반드시 빈 배열([])로 출력해야 합니다.\n"
+    return (
+        f"당신은 '{lang_name}'로 작성된 텍스트를 분석하여 전문가 수준의 교육용 문제를 생성하는 봇입니다. "
+        "입력 텍스트의 '핵심 요점' 섹션에 명시된 사실만을 근거로 문제를 생성해야 합니다. "
+        f"{num}개의 '{q_type_desc}' 문제를 생성하세요. "
+        "출력은 **반드시** JSON 배열 형식만 사용해야 합니다. 다른 설명이나 사족을 추가하지 마세요. "
+        f"\n\nJSON 형식: [\n  {{\n    \"question\": \"[질문 내용]\",\n    \"answer\": \"[정답]\",\n    \"options\": {options_example}\n  }}\n]"
+        f"\n\n규칙:\n{options_rule}"
+        "2. 질문은 명확하고, 정답은 '핵심 요점'에 기반해야 합니다.\n"
+        "3. 마크다운(```` )이나 다른 꾸밈 없이 순수 JSON 배열만 출력합니다."
+    )
+
+
+# =========================================================
+# Main question-generation function
+# =========================================================
+async def generate_questions_from_text(
+    text: str,
+    num_questions: int,
+    question_type: Literal["multiple_choice", "short_answer"],
+    language: Literal["ko", "en"],
+) -> List[QuestionItem]:
+    """Generate quiz questions from ``text`` with the cached causal LM and return them as ``QuestionItem`` objects.
+
+    Side effect: overwrites ``qg_questions.json`` and ``qg_answers.json`` in the current working directory.
+    Raises ``RuntimeError`` (from ``_load_qg_model``) and ``ValueError`` when the model output is not valid JSON.
+    """
+    # 1. Load the model. NOTE: this call is synchronous, so the first (uncached) load runs on the event loop.
+    model, tokenizer = _load_qg_model()
+
+    # 2. Pre-process the text
+    key_points_only = _extract_key_points(text)
+
+    # 3. Build the prompt
+    system_prompt = _get_system_prompt(num_questions, question_type, language)
+    user_payload = f"다음 '핵심 요점' 텍스트를 기반으로 문제를 생성합니다:\n\n---\n{key_points_only}"
+    prompt = _build_qg_prompt(tokenizer, system_prompt, user_payload)
+
+    # 4. LLM inference (blocking, so it is pushed to the default executor below)
+    def _generate_sync():
+        inputs = tokenizer(prompt, return_tensors="pt")
+        if torch.cuda.is_available():
+            try:
+                target_device = next(model.parameters()).device
+                inputs = {k: v.to(target_device) for k, v in inputs.items()}
+            except Exception:
+                pass  # best effort: leave the tensors where they are if the move fails
+
+        gen_kwargs = dict(
+            max_new_tokens=QG_MAX_NEW_TOKENS,
+            do_sample=True,
+            temperature=QG_TEMPERATURE,
+            repetition_penalty=1.05,
+            eos_token_id=tokenizer.eos_token_id,
+            pad_token_id=tokenizer.eos_token_id or tokenizer.pad_token_id,
+        )
+
+        with torch.no_grad():
+            out = model.generate(**inputs, **gen_kwargs)
+        gen_ids = out[0, inputs["input_ids"].shape[1]:]  # drop the prompt tokens, keep only the completion
+        return tokenizer.decode(gen_ids, skip_special_tokens=True).strip()
+
+    loop = asyncio.get_running_loop()  # documented way to obtain the loop from inside a coroutine
+    raw_json_output = await loop.run_in_executor(None, _generate_sync)
+
+    # 5. Parse the JSON and persist it
+    try:
+        if raw_json_output.startswith("```"):
+            # Accept both a "```json" and a bare "```" opening fence, then drop the closing fence.
+            raw_json_output = re.sub(r"^```(?:json)?\s*", "", raw_json_output, flags=re.IGNORECASE)
+            raw_json_output = re.sub(r"```\s*$", "", raw_json_output)
+
+        json_data: List[Dict[str, Any]] = json.loads(raw_json_output)
+        question_items = [QuestionItem(**item) for item in json_data]
+
+        # Questions and answers are written to separate files, joined by a 1-based id
+        questions_for_file = [{"id": i+1, "question": q.question, "options": q.options} for i, q in enumerate(question_items)]
+        answers_for_file = [{"id": i+1, "question_preview": q.question[:50] + "...", "correct_answer": q.answer} for i, q in enumerate(question_items)]
+
+        with open("qg_questions.json", "w", encoding="utf-8") as f:
+            json.dump({"questions": questions_for_file}, f, ensure_ascii=False, indent=4)
+        with open("qg_answers.json", "w", encoding="utf-8") as f:
+            json.dump({"answers": answers_for_file}, f, ensure_ascii=False, indent=4)
+
+        print("[INFO] 문제/답안 파일 저장 완료.")
+        return question_items
+    except json.JSONDecodeError as e:
+        print(f"JSON 파싱 오류: {e}\n원본 출력:\n{raw_json_output}")
+        raise ValueError(f"LLM 출력 JSON 파싱 실패: {raw_json_output[:100]}...") from e
+    except Exception as e:
+        print(f"최종 데이터 처리 오류: {e}")
+        raise