#StackBounty: #go Go program to find duplicated files in a directory (recursively)

Bounty: 50

This is my first Go program. I’m learning the language, but I find some of its concepts difficult to grasp, so I wrote this for practice. It’s a simple program that recursively checks for duplicated files in a directory.

It uses a SHA256 hash on files in order to identify if two files are the same or not. I spawn multiple workers to handle this hashing.

Here is how it works:

  • n workers (goroutines) are spawned, each of them waiting for file paths to process on the same channel, named input in my code.
  • 1 goroutine is spawned to recursively search for files in the directory; it populates the input channel with file names.
  • The main goroutine processes the results as soon as they are available and adds them to a map of sha256->[file, file, …].

Finally we just display the duplicates.

Please feel free to comment on anything. I really want to progress in Go, and especially in “idiomatic” Go.

EDIT: Improved my initial code with flags and error management.

package main

import (
    "crypto/sha256"
    "encoding/hex"
    "fmt"
    "os"
    "path/filepath"
    "sync"
    "flag"
    "runtime"
    "io"
)

// Command-line settings; main binds both to flags before any goroutine starts.
var (
    // dir is the root directory that search walks.
    dir string
    // workers is the number of worker goroutines main launches.
    workers int
)

// Result pairs a file path with the SHA-256 digest computed for that file.
// Workers send one Result per successfully hashed file.
type Result struct {
    // file is the path of the hashed file, exactly as it was received.
    file   string
    // sha256 is the 32-byte SHA-256 digest of the file's contents.
    sha256 [32]byte
}

// worker hashes every file path received on input and sends one *Result per
// successfully hashed file on results. It returns once input has been closed
// and drained, and calls wg.Done on the way out.
//
// Files that cannot be opened or read are reported on os.Stderr and skipped,
// so a single unreadable file does not stop the scan.
func worker(input <-chan string, results chan<- *Result, wg *sync.WaitGroup) {
    // Deferred so the WaitGroup is released on every exit path.
    defer wg.Done()

    for file := range input {
        sum, err := hashFile(file)
        if err != nil {
            fmt.Fprintln(os.Stderr, err)
            continue
        }
        results <- &Result{file: file, sha256: sum}
    }
}

// hashFile returns the SHA-256 digest of the contents of the file at path.
// On error the returned digest is the zero value and must not be used.
func hashFile(path string) ([32]byte, error) {
    var sum [32]byte

    f, err := os.Open(path)
    if err != nil {
        return sum, err
    }
    // The file is only read, so an error from Close cannot lose data.
    defer f.Close()

    h := sha256.New()
    if _, err := io.Copy(h, f); err != nil {
        return [32]byte{}, err
    }
    // Sum appends the digest to its argument; sum[:0] has capacity 32, so the
    // digest is written straight into sum without an extra allocation.
    h.Sum(sum[:0])
    return sum, nil
}

// search walks the directory tree rooted at the package-level dir and sends
// the path of every regular file on input. Errors reported by the walk are
// printed to os.Stderr and the walk carries on. input is closed when the walk
// is over, which lets receivers ranging over it terminate.
func search(input chan string) {
    defer close(input)

    visit := func(path string, info os.FileInfo, walkErr error) error {
        if walkErr != nil {
            fmt.Fprintln(os.Stderr, walkErr)
            return nil
        }
        if info.Mode().IsRegular() {
            input <- path
        }
        return nil
    }

    // visit never returns an error, so Walk has nothing to report back.
    _ = filepath.Walk(dir, visit)
}

// main parses the -dir and -workers flags, runs the hashing pipeline, and
// prints every group of files that share a SHA-256 digest.
//
// The pipeline has three parts: one goroutine running search feeds file paths
// into input, `workers` goroutines running worker hash them and send to
// results, and this goroutine collects the results into a digest -> paths map.
func main() {
    flag.StringVar(&dir, "dir", ".", "directory to search")
    flag.IntVar(&workers, "workers", runtime.NumCPU(), "number of workers")
    flag.Parse()

    // With zero workers nothing would ever be hashed and the program would
    // silently report no duplicates; a negative count makes wg.Add panic.
    if workers < 1 {
        fmt.Fprintf(os.Stderr, "invalid -workers value %d: must be at least 1\n", workers)
        os.Exit(2)
    }

    fmt.Printf("Searching in %s using %d workers...\n", dir, workers)

    input := make(chan string)
    results := make(chan *Result)

    var wg sync.WaitGroup
    wg.Add(workers)
    for i := 0; i < workers; i++ {
        go worker(input, results, &wg)
    }

    go search(input)

    // Close results only after every worker has returned, so the collecting
    // loop below terminates and no worker sends on a closed channel.
    go func() {
        wg.Wait()
        close(results)
    }()

    // Group file paths by digest; any group with more than one path is a set
    // of duplicates.
    counter := make(map[[32]byte][]string)
    for result := range results {
        counter[result.sha256] = append(counter[result.sha256], result.file)
    }

    for sha, files := range counter {
        if len(files) < 2 {
            continue
        }
        fmt.Printf("Found %d duplicates for %s: \n", len(files), hex.EncodeToString(sha[:]))
        for _, f := range files {
            fmt.Println("-> ", f)
        }
    }
}


Get this bounty!!!

Leave a Reply