Collecting Data & Remove

Collecting Data from the Web with Python If you like

Remove duplicate images

1. Remove invalid images

  1. import os
  2. import sys
  3. import cv2
  4.  
  5. from collect_data import SUPPORTED_FORMATS
  6.  
  7. input_path = sys.argv[1]
  8.  
  9. for root, dirs, files in os.walk(input_path):
  10.     for filename in files:
  11.         ext = filename[filename.rfind('.')+1:].lower()
  12.         if ext not in SUPPORTED_FORMATS:
  13.             continue
  14.         filepath = os.sep.join([root, filename])
  15.         if cv2.imread(filepath) is None:
  16.             os.system('rm {}'.format(filepath))
  17.             print('{} is not a valid image file. Deleted!'.format(filepath))

2. Remove duplicate files

FDUPES is a program for identifying duplicate files residing
within specified directories.

  1. sudo apt-get install fdupes  # install the fdupes tool
  2. fdupes -rdN ./  # -r: recurse into subdirectories; -d: delete duplicates; -N: do not prompt, keep the first file of each duplicate set

3. Remove visually similar or duplicate images

3.1 Downscale images

  1. import os
  2. import cv2
  3. import sys
  4.  
  5. input_path = sys.argv[1].rstrip(os.sep)
  6.  
  7. target_short_edge = int(sys.argv[2])
  8.  
  9. for root, dirs, files in os.walk(input_path):
  10.     print('scanning {} ...'.format(root))
  11.     for filename in files:
  12.         filepath = os.sep.join([root, filename])
  13.         img = cv2.imread(filepath)
  14.         h, w = img.shape[:2]
  15.         short_edge = min(w, h)
  16.  
  17.         if short_edge > target_short_edge:
  18.             scale = float(target_short_edge) / float(short_edge)
  19.             new_w = int(round(w * scale))
  20.             new_h = int(round(h * scale))
  21.             print('Down sampling {} from {} x {} to {} x {} ...'.format(
  22.                 filepath, w, w, new_w, new_h
  23.             ))
  24.             img = cv2.resize(img, (new_w, new_h))
  25.             cv2.imwrite(filepath, img)
  26. print('Done!')

3.2 Find image dupes

  1. sudo apt-get install findimagedupes  # install the findimagedupes tool
  2. findimagedupes -R train > dup_list  # -R: recurse into subdirectories of train; the groups of similar images are written to dup_list

3.3 Remove duplicates

  1. import os
  2. import sys
  3. dup_list = sys.argv[1]
  4. with open(dup_list, 'r') as f:
  5.     lines = f.readlines()
  6.     for line in lines:
  7.         dups = line.split()
  8.         print('Removing duplicates of {}'.format(dups[0]))
  9.         for dup in dups[1:]:
  10.             cmd = 'rm {}'.format(dup)
  11.             os.system(cmd)

Leave a Reply

Your email address will not be published. Required fields are marked *