Module imdir

a simple python package to analyse a directory full of images

repository at : https://github.com/dhananjayraut/imdir/
documentation at : https://github.com/dhananjayraut/imdir/

features:

* Easy Interface to remember
* Analyse distribution of height, width, extensions etc.
* Multithreaded for faster execution
* high customizability for plots
* Detects corrupt files
* well tested on different platforms

example:

from imdir import image_dir

im_dir = image_dir(path="../input/train/",recursive=True, nthreads=4)

im_dir.sc_plot(alpha=0.5) # plot height and width as scatter plot
Expand source code
"""
a simple python package to analyse a directory full of images

**repository at** : https://github.com/dhananjayraut/imdir/ <br>
**documentation at** : https://github.com/dhananjayraut/imdir/

features:
```
* Easy Interface to remember
* Analyse distribution of height, width, extensions etc.
* Multithreaded for faster execution
* high customizability for plots
* Detects corrupt files
* well tested on different platforms
```

example:
```
from imdir import image_dir

im_dir = image_dir(path="../input/train/",recursive=True, nthreads=4)

im_dir.sc_plot(alpha=0.5) # plot height and width as scatter plot
```

"""

import os
import random
from PIL import Image
from multiprocessing import Pool, cpu_count
import matplotlib.pyplot as plt

# Lowercase file-name suffixes (leading dot included) that mark a file as an
# image; matched against the lower-cased file name.
_IMG_EXTENSIONS = (
    ".jpg", ".jpeg", ".png",
    ".ppm", ".bmp", ".pgm",
    ".tif", ".tiff", ".webp",
)


def _has_file_allowed_extension(filename, extensions):
    """Tell whether a file name carries one of the given extensions.

    The file name is lower-cased before matching, so the check is
    case-insensitive as long as ``extensions`` are given in lowercase.

    Args:
        filename (string): path to a file
        extensions (tuple of strings): extensions to consider (lowercase)
    Returns:
        bool: True if the filename ends with one of given extensions
    """
    lowered_name = filename.lower()
    return lowered_name.endswith(extensions)


def _is_image_file(filename):
    """Tell whether a file name ends with a known image extension.

    The name is lower-cased and compared against ``_IMG_EXTENSIONS``.

    Args:
        filename (string): path to a file
    Returns:
        bool: True if the filename ends with a known image extension
    """
    lowered_name = filename.lower()
    return lowered_name.endswith(_IMG_EXTENSIONS)


def _list_files(path="./", recursive=False):
    """Returns the list of filepaths for image files in given folder

    Args:
        path (string): path to the folder
        recursive (Boolean): whether to discover all sub directories.
            Any falsy value (False, 0, None) selects the non-recursive scan.
    Returns:
       list: list of paths of all image files in that folder
    Raises:
        OSError: in non-recursive mode, if ``path`` cannot be listed
            (raised by ``os.listdir``).
    """
    if recursive:
        # os.walk yields (folder, sub-directories, files); only files matter.
        candidates = [
            os.path.join(folder, filename)
            for folder, _subdirs, files in os.walk(path)
            for filename in files
        ]
    else:
        # os.listdir also returns directories, so keep regular files only.
        entries = (os.path.join(path, name) for name in os.listdir(path))
        candidates = [entry for entry in entries if os.path.isfile(entry)]
    return [f for f in candidates if _is_image_file(f)]


def _get_dimension(image_path):
    """Gets the dimension of one image

    Args:
        image_path (string): path to the image
    Returns:
       list: ``[width, height]`` of the image, or ``[-1, -1]`` when the
             file could not be opened (an error line is printed).
    """
    try:
        # The context manager closes the file handle as soon as the size
        # has been read, instead of leaving it open until garbage collection.
        with Image.open(image_path) as img:
            width, height = img.size
    except Exception:
        # Deliberate best effort: any failure marks the file as corrupt.
        # Unlike a bare ``except`` this lets KeyboardInterrupt/SystemExit
        # propagate.
        print("Error Ocured with file " + str(image_path))
        return [-1, -1]
    return [width, height]


def _get_dimensions(image_path_list, nthreads=-1):
    """Collect the width and height of every image in a list.

    Args:
       image_path_list (list): list containing paths of image files
       nthreads (int): number of worker processes; 0 reads the images in
                       the calling process, -1 uses ``cpu_count()`` workers
    Returns:
       tuple: ``(widths, heights)``, two lists in the same order as
              ``image_path_list``
    Note:- Returns -1, -1 for corrupt files.
    """
    if nthreads == 0:
        # Sequential path: no process pool is created.
        sizes = [_get_dimension(image_path) for image_path in image_path_list]
    else:
        worker_count = cpu_count() if nthreads == -1 else nthreads
        with Pool(worker_count) as pool:
            sizes = pool.map(_get_dimension, image_path_list)

    widths = [size[0] for size in sizes]
    heights = [size[1] for size in sizes]
    return widths, heights


def _get_extensions(image_path_list):
    """Extract the text after the last dot of every path.

    A path without any dot is returned unchanged (as a string).

    Args:
       image_path_list (list): list containing paths of image files
    Returns:
       list: list of extensions (without the dot), one per input path
    """
    extensions = []
    for image_path in image_path_list:
        # rpartition leaves the whole string in the last slot if no dot exists
        _, _, suffix = str(image_path).rpartition(".")
        extensions.append(suffix)
    return extensions


def _correct_them(fl, wl, hl):
    """Split files into valid and corrupt ones based on their dimensions.

    A width of -1 marks the file at the same index as corrupt.

    Args:
        fl (list): list containing paths of image files
        wl (list): list containing width of image files
        hl (list): list containing heights of image files
    Returns:
       tuple: ``(valid_files, corrupt_files, valid_widths, valid_heights)``;
              the three "valid" lists are index-aligned with each other
    """
    valid_files, valid_widths, valid_heights = [], [], []
    corrupt_files = []

    for index, width in enumerate(wl):
        if width == -1:
            corrupt_files.append(fl[index])
            continue
        valid_widths.append(width)
        valid_heights.append(hl[index])
        valid_files.append(fl[index])

    return valid_files, corrupt_files, valid_widths, valid_heights


class image_dir:
    """
    image directory class

    On construction the directory is scanned for image files, the
    dimensions of every image are read, and files whose dimensions could
    not be read are set aside as corrupt. The plotting methods visualise
    the collected data with matplotlib.
    """
    def __init__(self, path, recursive=False, nthreads=-1, sample_size=-1):
        """
        Args:
            path (str): path for the directory
            recursive (Boolean): whether to discover all sub directories
                                 default False
            nthreads (int): Number of Processes to use<br>
                            -1 (default) means use all cpu cores<br>
                            0  means do it in main thread (slow)
            sample_size (int): Number of images to consider <br>
                               -1 (default) means all images.<br>
                               A value larger than the number of images
                               found uses all of them.
        Returns:
            an image_dir object
        Note:
               When sample_size is given, random sampling is applied before
               the image sizes are read, hence some corrupt files may be
               present in the sample.
        """
        self.exten_counts = {}
        """(dict) extension counts dictionary"""
        self.file_list = []
        """(list) list of valid file paths"""
        self.corrupt_file_list = []
        """(list) list of corrupt file paths"""
        self.width_list = []
        """(list) list of width of images"""
        self.height_list = []
        """(list) list of height of images"""

        filelist = _list_files(path, recursive)
        if sample_size != -1:
            # Clamp the request: random.sample raises ValueError when asked
            # for more items than the population holds.
            filelist = random.sample(filelist, min(sample_size, len(filelist)))
        widthlist, heightlist = _get_dimensions(filelist, nthreads)
        fl, cl, wl, hl = _correct_them(filelist, widthlist, heightlist)
        self.file_list, self.corrupt_file_list = fl, cl
        self.width_list, self.height_list = wl, hl

        # Count extensions of the valid files in a single pass.
        counts = {}
        for ext in _get_extensions(self.file_list):
            counts[ext] = counts.get(ext, 0) + 1
        self.exten_counts = counts

    def sc_plot(self, **kwds):
        """
        Scatter plot of image widths (x axis) against heights (y axis).

        Args:
            **kwds: Additional keyword arguments to
                    matplotlib.pyplot 's scatter function
        """
        widths, heights = self.width_list, self.height_list
        plt.scatter(widths, heights, **kwds)
        plt.show()

    def width_plot(self, **kwds):
        """
        Histogram of the image widths.

        Args:
            **kwds: Additional keyword arguments to
                    matplotlib.pyplot 's hist function
        """
        widths = self.width_list
        plt.hist(widths, **kwds)
        plt.show()

    def height_plot(self, **kwds):
        """
        Histogram of the image heights.

        Args:
            **kwds: Additional keyword arguments to
                    matplotlib.pyplot 's hist function
        """
        heights = self.height_list
        plt.hist(heights, **kwds)
        plt.show()

    def exten_plot(self, **kwds):
        """
        Bar plot of how many images carry each file extension.

        Args:
            **kwds: Additional keyword arguments to
                    matplotlib.pyplot 's bar function
        """
        labels = list(self.exten_counts)
        totals = [self.exten_counts[label] for label in labels]
        plt.bar(labels, totals, **kwds)
        plt.show()

    def plot_some_images(self, nrows, ncols, showpaths=False, **kwds):
        """
        plots random images from the directory in a grid.

        If fewer valid images exist than grid cells, every image is shown
        once and the remaining cells are left blank.

        Args:
            nrows (int):  number of rows in grid.
            ncols (int):  number of columns in grid
            showpaths (bool):  whether to show paths as title
                               default False
            **kwds: accepted for backward compatibility; currently unused
        """
        # squeeze=False keeps `axes` two-dimensional even for a single row
        # or column, so grids such as 1x3 or 1x1 work as well.
        _fig, axes = plt.subplots(nrows, ncols, squeeze=False)
        cells = list(axes.flat)  # row-major order over the grid
        shown = min(len(cells), len(self.file_list))
        images = random.sample(population=self.file_list, k=shown)
        for index, ax in enumerate(cells):
            if index >= shown:
                # No image left for this cell: hide the empty axes.
                ax.axis("off")
                continue
            # Close each file right after it has been handed to imshow.
            with Image.open(images[index]) as img:
                ax.imshow(img)
            if showpaths:
                ax.set_title(images[index])
        plt.tight_layout()
        plt.show()

Classes

class image_dir (path, recursive=False, nthreads=-1, sample_size=-1)

image directory class

Args

path : str
path for the directory
recursive : Boolean
whether to discover all sub directories default False
nthreads : int
Number of Processes to use
-1 (default) means use all cpu cores
0 means do it in main thread (slow)
sample_size : int
Number of images to consider
-1 (default) means all images.

Returns

an image_dir object. Note: when sample_size is given, random sampling is applied before the image
sizes are read, hence some corrupt files may be present in the sample.

Expand source code
class image_dir:
    """
    image directory class

    On construction the directory is scanned for image files, the
    dimensions of every image are read, and files whose dimensions could
    not be read are set aside as corrupt. The plotting methods visualise
    the collected data with matplotlib.
    """
    def __init__(self, path, recursive=False, nthreads=-1, sample_size=-1):
        """
        Args:
            path (str): path for the directory
            recursive (Boolean): whether to discover all sub directories
                                 default False
            nthreads (int): Number of Processes to use<br>
                            -1 (default) means use all cpu cores<br>
                            0  means do it in main thread (slow)
            sample_size (int): Number of images to consider <br>
                               -1 (default) means all images.<br>
                               A value larger than the number of images
                               found uses all of them.
        Returns:
            an image_dir object
        Note:
               When sample_size is given, random sampling is applied before
               the image sizes are read, hence some corrupt files may be
               present in the sample.
        """
        self.exten_counts = {}
        """(dict) extension counts dictionary"""
        self.file_list = []
        """(list) list of valid file paths"""
        self.corrupt_file_list = []
        """(list) list of corrupt file paths"""
        self.width_list = []
        """(list) list of width of images"""
        self.height_list = []
        """(list) list of height of images"""

        filelist = _list_files(path, recursive)
        if sample_size != -1:
            # Clamp the request: random.sample raises ValueError when asked
            # for more items than the population holds.
            filelist = random.sample(filelist, min(sample_size, len(filelist)))
        widthlist, heightlist = _get_dimensions(filelist, nthreads)
        fl, cl, wl, hl = _correct_them(filelist, widthlist, heightlist)
        self.file_list, self.corrupt_file_list = fl, cl
        self.width_list, self.height_list = wl, hl

        # Count extensions of the valid files in a single pass.
        counts = {}
        for ext in _get_extensions(self.file_list):
            counts[ext] = counts.get(ext, 0) + 1
        self.exten_counts = counts

    def sc_plot(self, **kwds):
        """
        Scatter plot of image widths (x axis) against heights (y axis).

        Args:
            **kwds: Additional keyword arguments to
                    matplotlib.pyplot 's scatter function
        """
        widths, heights = self.width_list, self.height_list
        plt.scatter(widths, heights, **kwds)
        plt.show()

    def width_plot(self, **kwds):
        """
        Histogram of the image widths.

        Args:
            **kwds: Additional keyword arguments to
                    matplotlib.pyplot 's hist function
        """
        widths = self.width_list
        plt.hist(widths, **kwds)
        plt.show()

    def height_plot(self, **kwds):
        """
        Histogram of the image heights.

        Args:
            **kwds: Additional keyword arguments to
                    matplotlib.pyplot 's hist function
        """
        heights = self.height_list
        plt.hist(heights, **kwds)
        plt.show()

    def exten_plot(self, **kwds):
        """
        Bar plot of how many images carry each file extension.

        Args:
            **kwds: Additional keyword arguments to
                    matplotlib.pyplot 's bar function
        """
        labels = list(self.exten_counts)
        totals = [self.exten_counts[label] for label in labels]
        plt.bar(labels, totals, **kwds)
        plt.show()

    def plot_some_images(self, nrows, ncols, showpaths=False, **kwds):
        """
        plots random images from the directory in a grid.

        If fewer valid images exist than grid cells, every image is shown
        once and the remaining cells are left blank.

        Args:
            nrows (int):  number of rows in grid.
            ncols (int):  number of columns in grid
            showpaths (bool):  whether to show paths as title
                               default False
            **kwds: accepted for backward compatibility; currently unused
        """
        # squeeze=False keeps `axes` two-dimensional even for a single row
        # or column, so grids such as 1x3 or 1x1 work as well.
        _fig, axes = plt.subplots(nrows, ncols, squeeze=False)
        cells = list(axes.flat)  # row-major order over the grid
        shown = min(len(cells), len(self.file_list))
        images = random.sample(population=self.file_list, k=shown)
        for index, ax in enumerate(cells):
            if index >= shown:
                # No image left for this cell: hide the empty axes.
                ax.axis("off")
                continue
            # Close each file right after it has been handed to imshow.
            with Image.open(images[index]) as img:
                ax.imshow(img)
            if showpaths:
                ax.set_title(images[index])
        plt.tight_layout()
        plt.show()

Instance variables

var corrupt_file_list

(list) list of corrupt file paths

var exten_counts

(dict) extension counts dictionary

var file_list

(list) list of valid file paths

var height_list

(list) list of height of images

var width_list

(list) list of width of images

Methods

def exten_plot(self, **kwds)

bar plot for the image extension counts.

Args

**kwds
Additional keyword arguments to matplotlib.pyplot 's bar function
Expand source code
def exten_plot(self, **kwds):
    """
    Bar plot of how many images carry each file extension.

    Args:
        **kwds: Additional keyword arguments to
                matplotlib.pyplot 's bar function
    """
    labels = list(self.exten_counts)
    totals = [self.exten_counts[label] for label in labels]
    plt.bar(labels, totals, **kwds)
    plt.show()
def height_plot(self, **kwds)

histogram plot for the image heights.

Args

**kwds
Additional keyword arguments to matplotlib.pyplot 's hist function
Expand source code
def height_plot(self, **kwds):
    """
    Histogram of the image heights.

    Args:
        **kwds: Additional keyword arguments to
                matplotlib.pyplot 's hist function
    """
    heights = self.height_list
    plt.hist(heights, **kwds)
    plt.show()
def plot_some_images(self, nrows, ncols, showpaths=False, **kwds)

plots random images from the directory in a grid.

Args

nrows : int
number of rows in grid.
ncols : int
number of columns in grid
showpaths : bool
whether to show paths as title default False
**kwds
Additional keyword arguments; accepted but not used by the current implementation
Expand source code
def plot_some_images(self, nrows, ncols, showpaths=False, **kwds):
    """
    plots random images from the directory in a grid.

    If fewer valid images exist than grid cells, every image is shown
    once and the remaining cells are left blank.

    Args:
        nrows (int):  number of rows in grid.
        ncols (int):  number of columns in grid
        showpaths (bool):  whether to show paths as title
                           default False
        **kwds: accepted for backward compatibility; currently unused
    """
    # squeeze=False keeps `axes` two-dimensional even for a single row
    # or column, so grids such as 1x3 or 1x1 work as well.
    _fig, axes = plt.subplots(nrows, ncols, squeeze=False)
    cells = list(axes.flat)  # row-major order over the grid
    shown = min(len(cells), len(self.file_list))
    images = random.sample(population=self.file_list, k=shown)
    for index, ax in enumerate(cells):
        if index >= shown:
            # No image left for this cell: hide the empty axes.
            ax.axis("off")
            continue
        # Close each file right after it has been handed to imshow.
        with Image.open(images[index]) as img:
            ax.imshow(img)
        if showpaths:
            ax.set_title(images[index])
    plt.tight_layout()
    plt.show()
def sc_plot(self, **kwds)

Scatter plot for the image dimensions

Args

**kwds
Additional keyword arguments to matplotlib.pyplot 's scatter function
Expand source code
def sc_plot(self, **kwds):
    """
    Scatter plot of image widths (x axis) against heights (y axis).

    Args:
        **kwds: Additional keyword arguments to
                matplotlib.pyplot 's scatter function
    """
    widths, heights = self.width_list, self.height_list
    plt.scatter(widths, heights, **kwds)
    plt.show()
def width_plot(self, **kwds)

histogram plot for the image widths.

Args

**kwds
Additional keyword arguments to matplotlib.pyplot 's hist function
Expand source code
def width_plot(self, **kwds):
    """
    Histogram of the image widths.

    Args:
        **kwds: Additional keyword arguments to
                matplotlib.pyplot 's hist function
    """
    widths = self.width_list
    plt.hist(widths, **kwds)
    plt.show()