bge_finetune/data/__init__.py
2025-07-22 16:55:25 +08:00

86 lines
3.1 KiB
Python

"""
Integrated data processing and augmentation module for BGE fine-tuning.

Provides a unified dataset pipeline supporting both embedding and reranker
formats. Public names are imported lazily, on first attribute access.
"""
# Single source of truth for the package's lazily exported names.
# Maps public attribute name -> (submodule of this package, attribute inside it).
_LAZY_EXPORTS = {
    # Core dataset classes - from integrated dataset.py
    'BGEDataset': ('dataset', 'BGEDataset'),
    'BGEM3Dataset': ('dataset', 'BGEM3Dataset'),
    'BGERerankerDataset': ('dataset', 'BGERerankerDataset'),
    'JointDataset': ('dataset', 'JointDataset'),
    # Factory functions for dataset creation
    'create_embedding_dataset': ('dataset', 'create_embedding_dataset'),
    'create_reranker_dataset': ('dataset', 'create_reranker_dataset'),
    # Data migration utilities
    'migrate_nested_to_flat': ('dataset', 'migrate_nested_to_flat'),
    # Collate functions
    'collate_embedding_batch': ('dataset', 'collate_embedding_batch'),
    'collate_reranker_batch': ('dataset', 'collate_reranker_batch'),
    # Data preprocessing and augmentation
    'DataPreprocessor': ('preprocessing', 'DataPreprocessor'),
    'DataAugmenter': ('preprocessing', 'DataAugmenter'),
    'QueryAugmenter': ('augmentation', 'QueryAugmenter'),
    'PassageAugmenter': ('augmentation', 'PassageAugmenter'),
    'HardNegativeAugmenter': ('augmentation', 'HardNegativeAugmenter'),
    'CrossLingualAugmenter': ('augmentation', 'CrossLingualAugmenter'),
    # Hard negative mining
    'ANCEHardNegativeMiner': ('hard_negative_mining', 'ANCEHardNegativeMiner'),
    'HardNegativeDataAugmenter': ('hard_negative_mining', 'HardNegativeDataAugmenter'),
    # Backward compatibility - remove after migration
    'M3Dataset': ('dataset', 'BGEM3Dataset'),
    'RerankerDataset': ('dataset', 'BGERerankerDataset'),
    # Deprecated - use BGEDataset with legacy_support=True
    'NestedDataset': ('dataset', 'BGEDataset'),
}


def __getattr__(name):
    """Resolve a public package attribute lazily (PEP 562 module hook).

    Python calls this only when ``name`` is not found in the module's own
    namespace. The owning submodule is imported at that point rather than at
    package import time, so a missing optional dependency only surfaces when
    the symbol that needs it is actually requested.

    Args:
        name: Attribute name being looked up on this package.

    Returns:
        The object registered for ``name`` in ``_LAZY_EXPORTS``. Legacy alias
        names return the same object as the current name they point to.

    Raises:
        AttributeError: If ``name`` is not one of the lazily exported names.
        ImportError: If the owning submodule cannot be imported, or does not
            define the expected symbol.
    """
    try:
        submodule, attribute = _LAZY_EXPORTS[name]
    except (KeyError, TypeError):
        # TypeError covers a direct call with an unhashable argument; report it
        # the same way as any other unknown name.
        raise AttributeError(
            f"module '{__name__}' has no attribute '{name}'"
        ) from None

    # Function-scope import keeps this block self-contained.
    from importlib import import_module

    module = import_module(f'.{submodule}', __name__)
    try:
        return getattr(module, attribute)
    except AttributeError as exc:
        # A listed symbol missing from its submodule is an import failure
        # (as with ``from .submodule import symbol``), not an unknown attribute.
        raise ImportError(
            f"cannot import name '{attribute}' from '{module.__name__}'",
            name=module.__name__,
        ) from exc