86 lines
3.1 KiB
Python
86 lines
3.1 KiB
Python
"""
|
|
Integrated data processing and augmentation module for BGE fine-tuning
|
|
Now with unified dataset pipeline supporting both embedding and reranker formats
|
|
"""
|
|
|
|
# Public attribute name -> (relative submodule, symbol to fetch from it).
# Keeping this as data means the lazily exported API is listed in exactly one
# place instead of being spread over a long if/elif chain.
_LAZY_EXPORTS = {
    # Core dataset classes - from integrated dataset.py
    'BGEDataset': ('.dataset', 'BGEDataset'),
    'BGEM3Dataset': ('.dataset', 'BGEM3Dataset'),
    'BGERerankerDataset': ('.dataset', 'BGERerankerDataset'),
    'JointDataset': ('.dataset', 'JointDataset'),

    # Factory functions for dataset creation
    'create_embedding_dataset': ('.dataset', 'create_embedding_dataset'),
    'create_reranker_dataset': ('.dataset', 'create_reranker_dataset'),

    # Data migration utilities
    'migrate_nested_to_flat': ('.dataset', 'migrate_nested_to_flat'),

    # Collate functions
    'collate_embedding_batch': ('.dataset', 'collate_embedding_batch'),
    'collate_reranker_batch': ('.dataset', 'collate_reranker_batch'),

    # Data preprocessing and augmentation
    'DataPreprocessor': ('.preprocessing', 'DataPreprocessor'),
    'DataAugmenter': ('.preprocessing', 'DataAugmenter'),
    'QueryAugmenter': ('.augmentation', 'QueryAugmenter'),
    'PassageAugmenter': ('.augmentation', 'PassageAugmenter'),
    'HardNegativeAugmenter': ('.augmentation', 'HardNegativeAugmenter'),
    'CrossLingualAugmenter': ('.augmentation', 'CrossLingualAugmenter'),

    # Hard negative mining
    'ANCEHardNegativeMiner': ('.hard_negative_mining', 'ANCEHardNegativeMiner'),
    'HardNegativeDataAugmenter': (
        '.hard_negative_mining',
        'HardNegativeDataAugmenter',
    ),

    # Backward compatibility - remove after migration.
    # These legacy names resolve to the current classes.
    'M3Dataset': ('.dataset', 'BGEM3Dataset'),
    'RerankerDataset': ('.dataset', 'BGERerankerDataset'),
    # Deprecated - use BGEDataset with legacy_support=True
    'NestedDataset': ('.dataset', 'BGEDataset'),
}


def __getattr__(name):
    """Resolve a public attribute of this module on first access.

    Submodules are imported only when one of their symbols is requested, so
    a missing dependency of one submodule does not break importing the
    others.

    Args:
        name: Attribute name being looked up on this module.

    Returns:
        The object registered for ``name`` in ``_LAZY_EXPORTS``.

    Raises:
        AttributeError: If ``name`` is not a lazily exported attribute.
        ImportError: If the owning submodule cannot be imported, or does not
            define the expected symbol.
    """
    try:
        submodule, symbol = _LAZY_EXPORTS[name]
    except KeyError:
        # ``from None`` hides the internal KeyError; callers only see the
        # standard "no attribute" error they would get from any module.
        raise AttributeError(
            f"module '{__name__}' has no attribute '{name}'"
        ) from None

    # Local import keeps this block self-contained (no new file-level import).
    import importlib

    # ``__spec__.parent`` is the package that relative imports written in
    # this file resolve against.
    module = importlib.import_module(submodule, __spec__.parent)
    try:
        return getattr(module, symbol)
    except AttributeError:
        # Match ``from .mod import symbol``, which reports a missing symbol
        # as ImportError rather than AttributeError.
        raise ImportError(
            f"cannot import name '{symbol}' from '{module.__name__}'"
        ) from None
|