init commit
This commit is contained in:
85
data/__init__.py
Normal file
85
data/__init__.py
Normal file
@@ -0,0 +1,85 @@
|
||||
"""
|
||||
Integrated data processing and augmentation module for BGE fine-tuning
|
||||
Now with unified dataset pipeline supporting both embedding and reranker formats
|
||||
"""
|
||||
|
||||
# Maps every lazily exported public name to ``(submodule, attribute)``.
# Nothing listed here is imported until it is first requested, so a missing
# dependency of one submodule does not break importing the package itself.
_LAZY_EXPORTS = {
    # Core dataset classes - from integrated dataset.py
    'BGEDataset': ('dataset', 'BGEDataset'),
    'BGEM3Dataset': ('dataset', 'BGEM3Dataset'),
    'BGERerankerDataset': ('dataset', 'BGERerankerDataset'),
    'JointDataset': ('dataset', 'JointDataset'),

    # Factory functions for dataset creation
    'create_embedding_dataset': ('dataset', 'create_embedding_dataset'),
    'create_reranker_dataset': ('dataset', 'create_reranker_dataset'),

    # Data migration utilities
    'migrate_nested_to_flat': ('dataset', 'migrate_nested_to_flat'),

    # Collate functions
    'collate_embedding_batch': ('dataset', 'collate_embedding_batch'),
    'collate_reranker_batch': ('dataset', 'collate_reranker_batch'),

    # Data preprocessing and augmentation
    'DataPreprocessor': ('preprocessing', 'DataPreprocessor'),
    'DataAugmenter': ('preprocessing', 'DataAugmenter'),
    'QueryAugmenter': ('augmentation', 'QueryAugmenter'),
    'PassageAugmenter': ('augmentation', 'PassageAugmenter'),
    'HardNegativeAugmenter': ('augmentation', 'HardNegativeAugmenter'),
    'CrossLingualAugmenter': ('augmentation', 'CrossLingualAugmenter'),

    # Hard negative mining
    'ANCEHardNegativeMiner': ('hard_negative_mining', 'ANCEHardNegativeMiner'),
    'HardNegativeDataAugmenter': (
        'hard_negative_mining',
        'HardNegativeDataAugmenter',
    ),

    # Backward compatibility - remove after migration
    'M3Dataset': ('dataset', 'BGEM3Dataset'),
    'RerankerDataset': ('dataset', 'BGERerankerDataset'),
    # Deprecated - use BGEDataset with legacy_support=True
    'NestedDataset': ('dataset', 'BGEDataset'),
}


def __getattr__(name):
    """Resolve a lazily exported attribute of this package on demand.

    Python calls this module-level hook when normal attribute lookup on the
    module fails. The requested name is looked up in ``_LAZY_EXPORTS`` and
    the owning submodule is imported only at that point, which avoids import
    errors for missing dependencies until the name is actually used.

    Args:
        name: Attribute name requested on the module.

    Returns:
        The object registered for ``name``. Legacy alias names return the
        current object they are mapped to.

    Raises:
        AttributeError: If ``name`` is not a lazily exported name.
        ImportError: If the owning submodule cannot be imported, or if it
            does not define the mapped attribute.
    """
    target = _LAZY_EXPORTS.get(name)
    if target is None:
        raise AttributeError(f"module '{__name__}' has no attribute '{name}'")

    # Imported at function scope so the hook does not depend on a
    # module-level import being present elsewhere in the file.
    from importlib import import_module

    submodule, attribute = target
    module = import_module(f'.{submodule}', __name__)
    try:
        return getattr(module, attribute)
    except AttributeError:
        # ``from .submodule import attribute`` reports a missing name as
        # ImportError; keep that exception type for existing callers.
        raise ImportError(
            f"cannot import name '{attribute}' from '{module.__name__}'",
            name=module.__name__,
        ) from None
|
||||
Reference in New Issue
Block a user