From 1a827925eb9d943b2da8c75d8cf008c243eb2772 Mon Sep 17 00:00:00 2001
From: ayush chaurasia <ayush.chaurarsia@gmail.com>
Date: Tue, 16 Apr 2024 08:59:36 +0530
Subject: [PATCH] update docs

---
 .../embeddings/fine_tuner/basetuner.py        |  10 +-
 .../lancedb/embeddings/fine_tuner/dataset.py  | 127 ++++++++++++++++--
 2 files changed, 123 insertions(+), 14 deletions(-)

diff --git a/python/python/lancedb/embeddings/fine_tuner/basetuner.py b/python/python/lancedb/embeddings/fine_tuner/basetuner.py
index a90be05a..eddad90d 100644
--- a/python/python/lancedb/embeddings/fine_tuner/basetuner.py
+++ b/python/python/lancedb/embeddings/fine_tuner/basetuner.py
@@ -6,8 +6,14 @@ class BaseEmbeddingTuner(ABC):
 
     @abstractmethod
     def finetune(self) -> None:
-        """Goes off and does stuff."""
+        """
+        Finetune the embedding model.
+        """
+        pass
 
     def helper(self) -> None:
-        """A helper method."""
+        """
+        A helper method called after finetuning. This is meant to provide
+        usage instructions or other helpful information.
+        """
         pass
diff --git a/python/python/lancedb/embeddings/fine_tuner/dataset.py b/python/python/lancedb/embeddings/fine_tuner/dataset.py
index 6d01abff..c90ba6b6 100644
--- a/python/python/lancedb/embeddings/fine_tuner/dataset.py
+++ b/python/python/lancedb/embeddings/fine_tuner/dataset.py
@@ -52,7 +52,17 @@ class QADataset(BaseModel):
         ]
 
     def save(self, path: str, mode: str = "overwrite") -> None:
-        """Save to lance dataset"""
+        """
+        Save the current dataset to a directory as .lance files.
+
+        Parameters
+        ----------
+        path : str
+            The path to save the dataset.
+        mode : str, optional
+            The mode to save the dataset, by default "overwrite". Accepts
+            lance modes.
+        """
         save_dir = Path(path)
         save_dir.mkdir(parents=True, exist_ok=True)
 
@@ -87,7 +97,20 @@ class QADataset(BaseModel):
 
     @classmethod
     def load(cls, path: str) -> "QADataset":
-        """Load from .lance data"""
+        """
+        Load QADataset from a directory.
+
+        Parameters
+        ----------
+        path : str
+            The path to load the dataset from.
+        
+        Returns
+        -------
+        QADataset
+            The loaded QADataset.
+        
+        """
         load_dir = Path(path)
         queries = lance.dataset(load_dir / "queries.lance").to_table().to_pydict()
         corpus = lance.dataset(load_dir / "corpus.lance").to_table().to_pydict()
@@ -109,7 +132,25 @@ class QADataset(BaseModel):
         qa_generate_prompt_tmpl: str = DEFAULT_PROMPT_TMPL,
         num_questions_per_chunk: int = 2,
     ) -> "QADataset":
-        """Generate examples given a set of nodes."""
+        """
+        Generate a QADataset from a list of TextChunks.
+
+        Parameters
+        ----------
+        nodes : List[TextChunk]
+            The list of text chunks.
+        llm : BaseLLM
+            The language model to generate questions.
+        qa_generate_prompt_tmpl : str, optional
+            The template for generating questions, by default DEFAULT_PROMPT_TMPL.
+        num_questions_per_chunk : int, optional
+            The number of questions to generate per chunk, by default 2.
+
+        Returns
+        -------
+        QADataset
+            The generated QADataset.
+        """
         node_dict = {node.id: node.text for node in nodes}
 
         queries = {}
@@ -130,22 +171,46 @@ class QADataset(BaseModel):
                 queries[question_id] = question
                 relevant_docs[question_id] = [node_id]
 
-        return QADataset(queries=queries, corpus=node_dict, relevant_docs=relevant_docs)
+        return cls(queries=queries, corpus=node_dict, relevant_docs=relevant_docs)
 
     @classmethod
     def from_responses(
         cls,
-        docs: List["TextChunk"],
+        nodes: List["TextChunk"],
         queries: Dict[str, str],
         relevant_docs: Dict[str, List[str]],
     ) -> "QADataset":
-        """Create a QADataset from a list of TextChunks and a list of questions."""
-        node_dict = {node.id: node.text for node in docs}
+        """
+        Create a QADataset from a list of TextChunks, queries, and relevant docs.
+
+        Parameters
+        ----------
+        nodes : List[TextChunk]
+            The list of text chunks.
+        queries : Dict[str, str]
+            The queries. query id -> query.
+        relevant_docs : Dict[str, List[str]]
+            The relevant docs. Dict query id -> list of doc ids.
+        
+        Returns
+        -------
+        QADataset
+            The QADataset.
+        """
+        node_dict = {node.id: node.text for node in nodes}
         return cls(queries=queries, corpus=node_dict, relevant_docs=relevant_docs)
 
 
 class TextChunk(BaseModel):
-    """Simple text chunk for generating questions."""
+    """
+    Simple text chunk for storing text nodes. Acts as a wrapper around text.
+    Allows interoperability between different text processing libraries.
+
+    Args:
+        text (str): The text of the chunk.
+        id (str): The id of the chunk.
+        metadata (Dict[str, Any], optional): The metadata of the chunk. Defaults to {}.
+    """
 
     text: str
     id: str
@@ -153,22 +218,60 @@ class TextChunk(BaseModel):
 
     @classmethod
     def from_chunk(cls, chunk: str, metadata: dict = {}) -> "TextChunk":
-        """Create a SimpleTextChunk from a chunk."""
+        """
+        Create a TextChunk from a raw text chunk, assigning it a new UUID as id.
+
+        Parameters
+        ----------
+        chunk : str
+            The text chunk.
+        metadata : dict, optional
+            The metadata, by default {}.
+
+        Returns
+        -------
+        TextChunk
+            The text chunk.
+        
+        """
         # generate a unique id
         return cls(text=chunk, id=str(uuid.uuid4()), metadata=metadata)
 
     @classmethod
     def from_llama_index_node(cls, node):
-        """Convert a llama index node to a text chunk."""
+        """
+        Generate a TextChunk from a llama index node.
+
+        Parameters
+        ----------
+        node : llama_index.core.schema.TextNode
+            The llama index node.
+
+        """
         return cls(text=node.text, id=node.node_id, metadata=node.metadata)
 
     @classmethod
     def from_langchain_node(cls, node):
-        """Convert a langchaain node to a text chunk."""
+        """
+        Generate a TextChunk from a langchain node. Not implemented yet.
+
+        Parameters
+        ----------
+        node : langchain_core.documents.Document
+            The langchain document (expected type; confirm once implemented).
+
+        """
         raise NotImplementedError("Not implemented yet.")
 
     def to_dict(self) -> Dict[str, Any]:
-        """Convert to a dictionary."""
+        """
+        Convert to a dictionary.
+        
+        Returns
+        -------
+        Dict[str, Any]
+            The dictionary.
+        """
         return self.dict()
 
     def __str__(self) -> str: