Ticket #17792: filter_pdbs.py

File filter_pdbs.py, 1.4 KB (added by Roden Deng Luo, 5 months ago)

Added by email2trac

Line 
1#!/usr/bin/env python3
2import os
3import shutil
4import argparse
5
def is_well_formatted(pdb_path):
    """Return True if no atom serial number is repeated in the PDB file.

    Only lines starting with ``ATOM `` or ``HETATM`` are inspected; the
    serial number is read from columns 7-11 (``line[6:11]``).

    Serials are tracked across the whole file: MODEL/ENDMDL records are not
    treated as boundaries, so a multi-model file that restarts its numbering
    in each model is reported as not well formatted.

    Args:
        pdb_path: Path of the PDB file to scan.

    Returns:
        False as soon as a serial field fails to parse as an integer (this
        includes lines too short to contain one) or repeats an earlier
        serial; True if the whole file is scanned without either happening.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    seen = set()
    # Decode as ASCII and substitute any stray byte instead of relying on the
    # locale's default encoding: a single non-decodable byte (for example an
    # accented name in a REMARK line) would otherwise raise
    # UnicodeDecodeError and abort the caller's whole batch. Every byte maps
    # to exactly one character, so the column slice below stays aligned.
    with open(pdb_path, 'r', encoding='ascii', errors='replace') as f:
        for line in f:
            if not line.startswith(('ATOM ', 'HETATM')):
                continue
            try:
                serial = int(line[6:11])
            except ValueError:
                # Malformed line or non-integer serial: reject the file.
                return False
            if serial in seen:
                return False
            seen.add(serial)
    return True
24
def main():
    """Command-line entry point: move malformed PDB files into ``<pdb_dir>/bad``.

    Parses one positional argument, ``pdb_dir``. Every regular file directly
    inside it whose name ends in ``.pdb`` (case-insensitive) is checked with
    ``is_well_formatted``. Files that fail are moved into the ``bad``
    sub-directory; files that pass stay in place. One status line is printed
    per file.

    Exits through ``parser.error`` (usage message, status 2) when ``pdb_dir``
    is not an existing directory.
    """
    parser = argparse.ArgumentParser(
        description="Keep only well-formed PDBs (no duplicate atom serials)."
    )
    parser.add_argument('pdb_dir',
                        help="Directory containing .pdb files to filter")
    args = parser.parse_args()
    pdb_dir = os.path.abspath(args.pdb_dir)

    # Validate before touching the filesystem: os.makedirs() below would
    # otherwise silently create a mistyped directory and report success.
    if not os.path.isdir(pdb_dir):
        parser.error(f"not a directory: {pdb_dir}")

    bad_dir = os.path.join(pdb_dir, 'bad')
    os.makedirs(bad_dir, exist_ok=True)

    # Sorted so the report order is the same on every run and platform.
    for fn in sorted(os.listdir(pdb_dir)):
        if not fn.lower().endswith('.pdb'):
            continue
        full = os.path.join(pdb_dir, fn)
        # A directory (or other non-regular entry) named *.pdb cannot be
        # scanned; skip it rather than crash inside open().
        if not os.path.isfile(full):
            continue
        if is_well_formatted(full):
            print(f"✓ OK: {fn}")
        else:
            print(f"✗ malformed: {fn}")
            shutil.move(full, os.path.join(bad_dir, fn))
46
# Run the filter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
49