| import pandas as pd | |
| from Bio.Seq import Seq | |
| from Bio.SeqRecord import SeqRecord | |
| from Bio import SeqIO | |
def stratified_sampling(df, sample_size=5000):
    """Draw the same number of random rows from every ``label`` group of *df*.

    The number of rows taken per label is ``sample_size`` capped at the size
    of the smallest label group, so every label contributes equally.

    Args:
        df: DataFrame containing a ``label`` column.
        sample_size: Upper bound on the number of rows drawn per label.

    Returns:
        A new DataFrame holding the sampled rows with all original columns
        (including ``label``) and a fresh ``0..n-1`` index. Groups appear in
        the order ``df.groupby('label')`` yields them. If *df* has no
        labelled rows, an empty frame with the same columns is returned.
    """
    label_counts = df['label'].value_counts()
    if label_counts.empty:
        # No labelled rows: ``label_counts.min()`` would be NaN and there is
        # nothing to sample, so return an empty frame with the same columns.
        return df.iloc[0:0].reset_index(drop=True)

    per_label = min(sample_size, label_counts.min())

    # Sample each group explicitly instead of via ``groupby.apply``: ``apply``
    # operating on the grouping column is deprecated in recent pandas, and
    # once the grouping column is excluded the ``label`` column would vanish
    # after ``reset_index(drop=True)``. Iterating keeps every column.
    # The same fixed seed is used for every group, for reproducibility.
    samples = [
        group.sample(n=per_label, random_state=42)
        for _, group in df.groupby('label')
    ]
    return pd.concat(samples).reset_index(drop=True)
def _parse_fasta_description(description):
    """Split a FASTA header into its leading ID and ``key:value`` metadata.

    The header is expected to look like ``<id> key:value|key:value|...``:
    everything up to the first space is the ID, the rest is a ``|``-separated
    list of ``key:value`` fields.

    Returns:
        A ``(unique_id, fields)`` tuple where ``fields`` maps stripped keys
        to stripped values.

    Raises:
        ValueError: If any field (including an absent metadata section)
            contains no ``:`` separator.
    """
    unique_id, _, metadata = description.partition(' ')
    fields = {}
    for part in metadata.split('|'):
        # Split on the first ':' only so values that themselves contain a
        # colon are kept whole instead of being truncated.
        key, sep, value = part.partition(':')
        if not sep:
            raise ValueError(f"expected 'key:value' but got {part!r}")
        fields[key.strip()] = value.strip()
    return unique_id, fields


def fasta_to_df(fasta_file):
    """Read a FASTA file into a DataFrame with one row per parsed record.

    Records are read with ``SeqIO.parse(fasta_file, "fasta")``. Each record's
    description is parsed as ``<id> key:value|key:value|...``; the
    ``species``, ``sequence_length`` and ``label`` keys are extracted
    (``species``/``label`` default to ``None``, ``sequence_length`` to ``0``
    when the key is absent).

    Records whose description cannot be parsed, or whose ``sequence_length``
    is not an integer, are reported on stdout and skipped entirely, so the
    resulting columns always stay aligned.

    Args:
        fasta_file: Path or handle passed straight to ``SeqIO.parse``.

    Returns:
        DataFrame with columns ``unique_id``, ``species``,
        ``sequence_length``, ``label`` and ``sequence``.
    """
    columns = ['unique_id', 'species', 'sequence_length', 'label', 'sequence']
    rows = []
    for record in SeqIO.parse(fasta_file, "fasta"):
        try:
            unique_id, fields = _parse_fasta_description(record.description)
            sequence_length = int(fields.get('sequence_length', 0))
        except ValueError as e:
            print(f"Error parsing description for record {record.id}: {e}")
            continue
        # Only build the row once every field parsed, so a skipped record
        # never leaves a partial entry behind.
        rows.append((
            unique_id,
            fields.get('species'),
            sequence_length,
            fields.get('label'),
            str(record.seq),
        ))
    return pd.DataFrame(rows, columns=columns)