|
| 1 | +import argparse |
1 | 2 | import os |
2 | 3 | import urllib |
3 | 4 | import numpy as np |
| 5 | +from PIL import Image |
4 | 6 |
|
5 | 7 | from joblib import Parallel, delayed |
6 | 8 |
|
7 | 9 |
|
def download_image(download_str, save_dir):
    """Download one image described by a "name<TAB>url" line and vet its size.

    The image is saved as ``<img_name>.jpg`` inside ``save_dir``.  Images
    smaller than 500x500 pixels, files that cannot be opened as images, and
    partially written downloads are deleted again, so only sufficiently
    large, readable images remain on disk.

    Args:
        download_str: A line of the form "<img_name>\t<img_url>"
            (surrounding whitespace/newline is stripped).
        save_dir: Existing directory where the image file is written.
    """
    img_name, img_url = download_str.strip().split('\t')
    save_img = os.path.join(save_dir, "{}.jpg".format(img_name))
    downloaded = False
    try:
        if os.path.isfile(save_img):
            print("Already downloaded {}".format(save_img))
            return
        print("Downloading {} to {}.jpg".format(img_url, img_name))
        # Python 3 location of urlretrieve (the original urllib.urlretrieve
        # only exists on Python 2).
        urllib.request.urlretrieve(img_url, save_img)

        # Check size of the images; read the dimensions first and close the
        # file before removing it (deleting an open file fails on Windows).
        downloaded = True
        with Image.open(save_img) as img:
            width, height = img.size
        if width < 500 or height < 500:
            os.remove(save_img)
            print("Remove downloaded images (w:{}, h:{})".format(width, height))
    except Exception:
        if not downloaded:
            # Download failed; drop any partially written file so a rerun
            # does not mistake it for a completed download.
            if os.path.isfile(save_img):
                os.remove(save_img)
            print("Cannot download.")
        else:
            # The file was fully fetched but is not a readable image.
            os.remove(save_img)
            print("Remove failed, downloaded images.")
19 | 34 |
|
20 | 35 |
|
def main():
    """Sample image urls from a TSV file and download them in parallel.

    Reads ``--img_url_file`` (one "<img_name>\t<img_url>" entry per line),
    draws a reproducible random sample of at most ``--n_download_urls``
    entries, and downloads them into ``--output_dir`` with 12 workers.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--img_url_file", type=str, required=True,
                        help="File that contains list of image IDs and urls.")
    parser.add_argument("--output_dir", type=str, required=True,
                        help="Directory where to save outputs.")
    parser.add_argument("--n_download_urls", type=int, default=20000,
                        help="Number of urls to sample and download.")
    args = parser.parse_args()

    # Fixed seed so repeated runs sample the same subset of urls.
    np.random.seed(123456)

    with open(args.img_url_file) as f:
        lines = f.readlines()
    # Sample without replacement; cap at the number of available urls so
    # np.random.choice does not raise when the file is shorter than requested.
    n_sample = min(args.n_download_urls, len(lines))
    lines = np.random.choice(lines, size=n_sample, replace=False)

    Parallel(n_jobs=12)(delayed(download_image)(line, args.output_dir) for line in lines)
32 | 53 |
|
33 | 54 |
|
34 | 55 | if __name__ == "__main__": |
|
0 commit comments