Commit 5ea88d17 authored by Aral Balkan
Browse files

Enhancement: collate and use domains from new domain project

parent 6be79241
.DS_Store
node_modules
bak/
top-1m.csv
\ No newline at end of file
top-1m.csv
playground.coffee
\ No newline at end of file
This diff is collapsed.
......@@ -2,8 +2,8 @@
#
# Better Inspector
#
# Special Inquiry. Runs an inspection over a subset of the top
# 1,000,000 domains according to Alexa.
# Special Inquiry. Runs an inspection over all the currently-tracked
# domains (remote) or over the local HTTP Archive (local).
#
# This is Independent Technology.
#
......@@ -71,6 +71,7 @@ inquiryStatistics =
globalStatistics = GlobalStatistics.sharedInstance()
archivePath = path.join path.homedir(), 'better.fyi', 'archive'
domainsPath = path.join path.homedir(), 'better.fyi', 'domains'
domainsToInspect = 0
domainsInspected = 0
......@@ -222,36 +223,53 @@ prepareInspectionsForLocalInquiry = ->
#
# Returns a promise to read in the top 1 million domains
# from an Alexa CSV file and return an array of inspection
# promises for the number of domains requested.
# Returns a promise for a normalised, de-duplicated array of domain names,
# collated from every CSV file found in the domains directory (domainsPath).
#
prepareInspectionsForRemoteInquiry = ->
getNormalisedDomains = ->
_normalisedDomains = new Set()
fs.readdirAsync domainsPath
.then (files) ->
domains = (files.filter (file) -> file.match '.*csv$')
Promise.each domains, (domain) ->
# console.log "\nReading: #{domain}…\n"
fs.readFileAsync (path.join domainsPath, domain), 'utf-8'
.then (domainList) ->
(fs.readFileAsync './top-1m.csv', 'utf-8')
.then (data) ->
domains = data.split("\n")
domainListArray = (domainList.split "\n")
# console.log "Read domains."
# If the file ends with a newline, remove the resulting empty last item.
if domainListArray[domainListArray.length - 1] == ''
domainListArray.pop()
inspections = []
# Remove the domain index and any trailing path on the domain name
# (the Alexa lists have these, e.g., theguardian.com/uk)
domainListArray = domainListArray.map (d) ->
d.split(',')[1].split('/')[0]
domainIndex = domainIndexToStartAt
while domainIndex < domainIndexToStartAt + numberOfDomainsToAnalyse
domain = domains[domainIndex]
do (domain) ->
domainIndex++
domainBits = domain.split(',')
# Add the array to the set to remove duplicates.
domainListArray.forEach (domainName) -> _normalisedDomains.add domainName
.then ->
# console.log "Done:"
# console.log "#{_normalisedDomains.size} domains being tracked."
# console.log _normalisedDomains
return Array.from _normalisedDomains
inspection = -> inspect domainBits[1]
# _ promise.reflect()
inspections.push inspection
domainsToInspect = inspections.length
#
# Returns a promise for the normalised list of domains to inspect
# (see getNormalisedDomains) and records how many there are in
# domainsToInspect.
#
prepareInspectionsForRemoteInquiry = ->
return inspections
getNormalisedDomains()
.then (normalisedDomains) ->
domainsToInspect = normalisedDomains.length
return normalisedDomains
.catch (error) ->
throw new Error "Could not find top-1m.csv file: #{error}"
throw new Error "Could not prepare inspections for remote inquiry: #{error}"
#
# Decide which type of inspection to follow.
......
......@@ -20,7 +20,7 @@ This point cannot be stressed enough: **the manual investigation and editing pro
## Inquiry
Inquiry is a tool that runs inspections on batches of domains. Currently, we use it to analyse the [top five thousand domains from Alexa](https://source.ind.ie/better/inspector/blob/master/alexa-top-5000.csv).
Inquiry is a tool that runs inspections on [the domains currently being tracked by Better](https://source.ind.ie/better/domains).
## Installation
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment