2018-06-25 05:00:12 +00:00
#!/usr/bin/env node
2022-02-05 01:17:01 +00:00
const fs = require ( "fs" ) ;
const path = require ( "path" ) ;
const util = require ( "util" ) ;
2018-06-25 05:00:12 +00:00
2022-02-05 01:17:01 +00:00
const cheerio = require ( "cheerio" ) ;
const lunr = require ( "lunr" ) ;
const marked = require ( "marked" ) ;
2018-06-25 05:00:12 +00:00
/ * *
* Constants
* /
2022-02-05 01:17:01 +00:00
const headingsRequired = [ "Company blurb" ] ;
2018-06-25 05:00:12 +00:00
const headingsOptional = [
2022-02-05 01:17:01 +00:00
"Company size" ,
"Remote status" ,
"Region" ,
"Company technologies" ,
"Office locations" ,
"How to apply" ,
2018-06-25 05:00:12 +00:00
] ;
2022-02-05 01:17:01 +00:00
const headingsAll = headingsRequired . concat ( headingsOptional ) ;
2018-06-25 05:00:12 +00:00
/ * *
* Utility functions
* /
2022-02-05 01:17:01 +00:00
function companyNameToProfileFilename ( companyName ) {
return companyName
. toLowerCase ( )
. replace ( /&/g , " and " )
. replace ( /'/g , "" )
. replace ( /[^a-z0-9]+/gi , "-" )
. replace ( /^-|-$/g , "" ) ;
2018-06-25 05:00:12 +00:00
}
exports . companyNameToProfileFilename = companyNameToProfileFilename ;
// adapted from https://gist.github.com/RandomEtc/2657669
2022-02-05 01:17:01 +00:00
function jsonStringifyUnicodeEscaped ( obj ) {
return JSON . stringify ( obj ) . replace ( /[\u007f-\uffff]/g , ( c ) => {
return "\\u" + ( "0000" + c . charCodeAt ( 0 ) . toString ( 16 ) ) . slice ( - 4 ) ;
} ) ;
2018-06-25 05:00:12 +00:00
}
exports . jsonStringifyUnicodeEscaped = jsonStringifyUnicodeEscaped ;
2022-02-05 01:17:01 +00:00
function toIdentifierCase ( text ) {
return text
. replace ( /'/g , "" )
. replace ( /[^a-z0-9]+/gi , " " )
. trim ( )
. split ( /\s+/ )
. map ( ( word , i ) => {
if ( i === 0 ) {
return word . toLowerCase ( ) ;
}
return word . substr ( 0 , 1 ) . toUpperCase ( ) + word . substr ( 1 ) . toLowerCase ( ) ;
} )
. join ( "" ) ;
2018-06-25 07:16:13 +00:00
}
exports . toIdentifierCase = toIdentifierCase ;
2022-02-05 01:17:01 +00:00
function stripExtraChars ( text ) {
return text . replace ( /\ufe0f/g , "" ) ;
2018-06-25 07:16:13 +00:00
}
exports . stripExtraChars = stripExtraChars ;
2018-08-12 01:41:11 +00:00
/ * *
* Other exports
* /
2020-05-06 06:42:21 +00:00
function getHeadingPropertyNames ( ) {
2022-02-05 01:17:01 +00:00
return headingsAll . reduce ( ( acc , val ) => {
acc [ toIdentifierCase ( val ) ] = val ;
return acc ;
} , { } ) ;
2020-05-06 06:42:21 +00:00
}
exports . headingPropertyNames = getHeadingPropertyNames ( ) ;
2018-08-12 01:41:11 +00:00
2018-06-25 05:00:12 +00:00
/ * *
* The main exported function
*
* Start with a directory including a README . md and company - profiles / * . md
* files , and validate and parse the content of the Markdown files .
* /
2022-02-05 01:17:01 +00:00
exports . parseFromDirectory = ( contentPath ) => {
const companyNamesSeen = { } ;
let errors = [ ] ;
function error ( filename , msg , ... params ) {
errors . push ( {
filename ,
message : util . format ( msg , ... params ) ,
} ) ;
}
// Build list of Markdown files containing company profiles.
const profilesPath = path . join ( contentPath , "company-profiles" ) ;
const profileFilenames = fs . readdirSync ( profilesPath ) ;
// Scan the company table in the readme.
const readmeCompanies = [ ] ;
const readmeMarkdown = stripExtraChars (
fs . readFileSync ( path . join ( contentPath , "README.md" ) , "utf8" )
) ;
let inTable = false ;
readmeMarkdown . split ( "\n" ) . forEach ( ( line ) => {
if ( /^\s*-+\s*\|\s*-+\s*\|\s*-+\s*$/ . test ( line ) ) {
inTable = true ;
} else if ( /^\s*$/ . test ( line ) ) {
inTable = false ;
} else if ( inTable ) {
const fields = line . split ( "|" ) ;
if ( fields . length !== 3 ) {
readmeError (
"Expected 3 table cells but found %d: %s" ,
fields . length ,
line
) ;
}
}
} ) ;
const $ = cheerio . load ( marked . parse ( readmeMarkdown ) ) ;
function readmeError ( msg , ... params ) {
error ( "README.md" , msg , ... params ) ;
}
const mainUrl = "remoteintech.company" ;
function addTargetBlankAndExternalLinkIcons ( el ) {
if ( el . type === "tag" ) {
const anchorTagElements = el . children . filter (
( element ) => element . name === "a"
) ;
if ( anchorTagElements . length > 0 ) {
anchorTagElements . forEach ( ( element ) => {
const url = element . attribs . href ;
const urlInfo = getUrlInfo ( url ) ;
if ( urlInfo . is _email || urlInfo . is _internal ) {
return ;
}
element . attribs . target = "_blank" ;
$element = $ ( element ) ;
$element . append (
'<span style="vertical-align: text-top;"> <img src="/assets/external-link.svg" /> </span>'
) ;
} ) ;
}
if ( el . children && el . children . length ) {
el . children . forEach ( ( element ) => {
addTargetBlankAndExternalLinkIcons ( element ) ;
} ) ;
}
}
}
/ * *
* Getting info about the url . It includes checking isEmail of isInternal
* @ param { * } url
* /
function getUrlInfo ( url ) {
const data = { } ;
if ( url . match ( /^mailto:/ ) ) {
// checking url email or not
data . is _email = true ;
return data ;
}
const mainDomainFromGivenUrl = extractMainDomainFromUrl ( url ) ;
// checking url is email or not
if ( mainDomainFromGivenUrl !== mainUrl ) {
data . is _internal = false ;
return data ;
} else {
data . is _internal = true ;
}
return data ;
}
/ * *
* Extracting main domain from the url
* @ param { * } url
* /
function extractMainDomainFromUrl ( url ) {
const domainRe = /(https?:\/\/){0,1}((?:[\w\d-]+\.)+[\w\d]{2,})/i ; // taken example from https://stackoverflow.com/questions/6238351/fastest-way-to-detect-external-urls
const data = domainRe . exec ( url ) ;
const splittedDomain = data [ 2 ] . split ( "." ) ;
if ( splittedDomain . length === 2 ) {
// check extra subdomain is present or not
return data [ 2 ] ;
}
return (
splittedDomain [ splittedDomain . length - 2 ] +
"." +
splittedDomain [ splittedDomain . length - 1 ]
) ; // return only main domain address
}
let lastCompanyName = null ;
$ ( "tr" ) . each ( ( i , tr ) => {
const $tr = $ ( tr ) ;
if ( i === 0 ) {
// Assign an ID to the table.
$tr . closest ( "table" ) . attr ( "id" , "companies-table" ) ;
// Skip the table header row.
return ;
}
const $td = $tr . children ( "td" ) ;
const websiteUrl = $td . eq ( 1 ) . text ( ) ;
const websiteText = websiteUrl
. replace ( /^https?:\/\// , "" )
. replace ( /^www\./ , "" )
. replace ( /\/$/ , "" ) ;
const readmeEntry = {
// Strip out warning emoji indicating that this profile is incomplete
name : $td
. eq ( 0 )
. text ( )
. replace ( /\u26a0/ , "" )
. trim ( ) ,
// Detect warning emoji next to company name
isIncomplete : /\u26a0/ . test ( $td . eq ( 0 ) . text ( ) ) ,
websiteUrl ,
websiteText ,
shortRegion : $td . eq ( 2 ) . text ( ) . trim ( ) ,
} ;
if ( ! websiteText ) {
readmeError ( "Missing website for company: %s" , readmeEntry . name ) ;
}
if ( readmeEntry . name ) {
if ( companyNamesSeen [ readmeEntry . name . toLowerCase ( ) ] ) {
readmeError ( "Duplicate company: %s" , readmeEntry . name ) ;
}
companyNamesSeen [ readmeEntry . name . toLowerCase ( ) ] = true ;
} else {
readmeError ( "Missing company name: %s" , $tr . html ( ) . replace ( /\n/g , "" ) ) ;
}
if (
$td . eq ( 1 ) . children ( ) . length !== 1 ||
! $td . eq ( 1 ) . children ( ) . eq ( 0 ) . is ( "a" )
) {
readmeError (
"Invalid content in Website column: %s" ,
$tr . html ( ) . replace ( /\n/g , "" )
) ;
}
if ( $td . eq ( 2 ) . children ( ) . length > 0 ) {
readmeError (
"Extra content in Region column: %s" ,
$tr . html ( ) . replace ( /\n/g , "" )
) ;
}
if (
lastCompanyName &&
readmeEntry . name . toLowerCase ( ) < lastCompanyName . toLowerCase ( )
) {
readmeError (
'Company is listed out of order: "%s" (should be before "%s")' ,
readmeEntry . name ,
lastCompanyName
) ;
}
lastCompanyName = readmeEntry . name ;
const $profileLink = $td . eq ( 0 ) . find ( "a" ) ;
if ( $profileLink . length === 1 ) {
const match = $profileLink
. attr ( "href" )
. match ( /^\/company-profiles\/(.*\.md)$/ ) ;
if ( match ) {
readmeEntry . linkedFilename = match [ 1 ] ;
if ( profileFilenames . indexOf ( readmeEntry . linkedFilename ) === - 1 ) {
readmeError (
'Missing company profile for "%s", or broken link: "%s"' ,
readmeEntry . name ,
$profileLink . attr ( "href" )
) ;
}
const nameCheck = $profileLink . text ( ) . trim ( ) ;
if ( nameCheck !== readmeEntry . name ) {
readmeError (
"Extra text in company name: %s, %s" ,
jsonStringifyUnicodeEscaped ( nameCheck ) ,
jsonStringifyUnicodeEscaped ( readmeEntry . name )
) ;
}
} else {
readmeError (
'Invalid link to company profile for "%s": "%s"' ,
readmeEntry . name ,
$profileLink . attr ( "href" )
) ;
}
} else {
readmeError (
'Company "%s" has no linked Markdown profile ("/company-profiles/%s.md")' ,
readmeEntry . name ,
companyNameToProfileFilename ( readmeEntry . name )
) ;
}
// Set identifying attributes on table elements
$tr . attr ( "class" , "company-row" ) . attr ( "id" , "company-row-" + ( i - 1 ) ) ;
$td . eq ( 0 ) . attr ( "class" , "company-name" ) ;
$td . eq ( 1 ) . attr ( "class" , "company-website" ) ;
$td . eq ( 2 ) . attr ( "class" , "company-region" ) ;
// Rewrite company profile link to the correct URL for the static site
if ( $profileLink . length ) {
$profileLink . attr (
"href" ,
$profileLink
. attr ( "href" )
. replace ( /^\/company-profiles\// , "/" )
. replace ( /\.md$/ , "/" )
) ;
}
// Rewrite external website link (target="_blank" etc, shorter text)
const $websiteLink = $td . eq ( 1 ) . children ( ) . eq ( 0 ) ;
$websiteLink
. attr ( "target" , "_blank" )
. attr ( "rel" , "noopener noreferrer" )
. text ( websiteText ) ;
readmeCompanies . push ( readmeEntry ) ;
} ) ;
const readmeContent = $ ( "body" ) . html ( ) ;
// Scan the individual Markdown files containing the company profiles.
const allProfileHeadings = { } ;
profileFilenames . forEach ( ( filename ) => {
function profileError ( msg , ... params ) {
error ( filename , msg , ... params ) ;
}
const profileMarkdown = stripExtraChars (
fs . readFileSync ( path . join ( profilesPath , filename ) , "utf8" )
) ;
const $ = cheerio . load ( marked . parse ( profileMarkdown ) ) ;
let hasTitleError = false ;
if ( $ ( "h1" ) . length !== 1 ) {
profileError (
"Expected 1 first-level heading but found %d" ,
$ ( "h1" ) . length
) ;
hasTitleError = true ;
}
if ( ! $ ( "h1" ) . parent ( ) . is ( "body" ) ) {
profileError ( "The main title is wrapped inside of another element." ) ;
}
const companyName = $ ( "h1" ) . text ( ) ;
if ( ! /[a-z]/i . test ( companyName ) ) {
profileError ( 'Company name looks wrong: "%s"' , companyName ) ;
hasTitleError = true ;
}
const filenameBase = filename . replace ( /\.md$/ , "" ) ;
const filenameExpected = companyNameToProfileFilename ( companyName ) ;
if (
! hasTitleError &&
filenameBase !== filenameExpected &&
// Some profile files just have shorter names than the company name,
// which is fine.
filenameExpected . substring ( 0 , filenameBase . length + 1 ) !==
filenameBase + "-"
) {
profileError (
'Company title "%s" doesn\'t match filename (expected ~ "%s.md")' ,
companyName ,
filenameExpected
) ;
}
const readmeEntry = readmeCompanies . find (
( readmeEntry ) => readmeEntry . linkedFilename === filename
) ;
if ( filename !== "example.md" && ! readmeEntry ) {
profileError ( "No link to company profile from readme" ) ;
}
// Build and validate list of headings contained in this Markdown profile.
const profileHeadings = [ ] ;
$ ( "h2" ) . each ( ( i , el ) => {
const headingName = $ ( el ) . html ( ) ;
if ( ! $ ( el ) . parent ( ) . is ( "body" ) ) {
profileError (
'The section heading for "%s" is wrapped inside of another element.' ,
headingName
) ;
}
if ( profileHeadings . indexOf ( headingName ) >= 0 ) {
profileError ( 'Duplicate section: "%s".' , headingName ) ;
} else {
// Track headings for this profile
profileHeadings . push ( headingName ) ;
// Track heading counts across all profiles
if ( ! allProfileHeadings [ headingName ] ) {
allProfileHeadings [ headingName ] = [ ] ;
}
allProfileHeadings [ headingName ] . push ( filename ) ;
}
if ( headingsAll . indexOf ( headingName ) === - 1 ) {
profileError (
'Invalid section: "%s". Expected one of: %s' ,
headingName ,
JSON . stringify ( headingsAll )
) ;
}
} ) ;
headingsRequired . forEach ( ( headingName ) => {
if ( profileHeadings . indexOf ( headingName ) === - 1 ) {
profileError ( 'Required section "%s" not found.' , headingName ) ;
}
} ) ;
// Build and validate the content of each section in this profile.
const profileContent = { } ;
if ( readmeEntry ) {
readmeEntry . profileContent = profileContent ;
}
let currentHeading = null ;
$ ( "body" )
. children ( )
. each ( ( i , el ) => {
const $el = $ ( el ) ;
if ( $el . is ( "h1" ) ) {
return ;
}
if ( $el . is ( "h2" ) ) {
currentHeading = $el . html ( ) ;
profileContent [ currentHeading ] = "" ;
} else if ( currentHeading ) {
// Note: This assumes that the only possible children of the
// 'body' are block-level elements. I think this is correct,
// because from what I've seen, any inline content is wrapped
// in a <p>.
addTargetBlankAndExternalLinkIcons ( el ) ;
profileContent [ currentHeading ] = (
profileContent [ currentHeading ] +
"\n\n" +
$ . html ( el )
) . trim ( ) ;
} else {
profileError (
"Content is not part of any section: %s" ,
$ . html ( el ) . replace ( /\n/g , "" )
) ;
}
} ) ;
Object . keys ( profileContent ) . forEach ( ( heading ) => {
const sectionText = profileContent [ heading ]
. replace ( /<[^>]+>/g , "" )
. trim ( ) ;
if ( ! sectionText ) {
profileError (
'Empty section: "%s". Fill it in or leave it out instead.' ,
heading
) ;
}
} ) ;
// Rewrite profile content to use more code-friendly heading names.
Object . keys ( profileContent ) . forEach ( ( headingName ) => {
const headingIdentifier = toIdentifierCase ( headingName ) ;
profileContent [ headingIdentifier ] = profileContent [ headingName ] ;
delete profileContent [ headingName ] ;
} ) ;
if ( readmeEntry && profileContent . companyBlurb ) {
// Check for company profiles that were filled in, but the "incomplete"
// mark was left in the readme, or vice versa.
const isIncomplete = {
readme : readmeEntry . isIncomplete ,
sections :
profileHeadings . length === 1 &&
profileHeadings [ 0 ] === "Company blurb" ,
content : /⚠/ . test ( profileContent . companyBlurb ) ,
} ;
const incompleteCount = Object . values ( isIncomplete ) . reduce (
( sum , v ) => sum + ( v ? 1 : 0 ) ,
0
) ;
// incompleteCount === 0: Profile is incomplete; all 3 indicators are consistent
// incompleteCount === 3: Profile is "complete"; all 3 indicators are consistent
if ( incompleteCount === 1 ) {
if ( isIncomplete . readme ) {
profileError (
"Profile looks complete, but the main readme contains a warning emoji."
) ;
} else if ( isIncomplete . sections ) {
profileError (
'Profile is marked as complete, but it only contains a "Company blurb" heading.'
) ;
} else {
// isIncomplete.content
profileError (
'Profile looks complete, but the "Company blurb" contains a warning emoji.'
) ;
}
} else if ( incompleteCount === 2 ) {
if ( ! isIncomplete . readme ) {
profileError (
"Profile looks incomplete, but the main readme does not contain a warning emoji."
) ;
} else if ( ! isIncomplete . sections ) {
profileError (
"Profile is marked as incomplete, but it contains multiple sections." +
'\nPlease remove the warning emoji from the "Company blurb" section and the main readme.'
) ;
} else {
// ! isIncomplete.content
profileError (
'Profile looks incomplete, but the "Company blurb" does not contain a warning emoji.'
) ;
}
}
}
} ) ;
const profileHeadingCounts = { } ;
Object . keys ( allProfileHeadings ) . forEach ( ( heading ) => {
profileHeadingCounts [ heading ] = allProfileHeadings [ heading ] . length ;
} ) ;
if ( errors . length > 0 ) {
return {
ok : false ,
errors ,
profileFilenames ,
profileHeadingCounts ,
} ;
}
return {
ok : true ,
profileFilenames ,
profileHeadingCounts ,
companies : readmeCompanies ,
readmeContent ,
} ;
2018-06-25 05:00:12 +00:00
} ;
2020-05-06 06:42:21 +00:00
/ * *
* Build search index data from the result of parseFromDirectory ( ) .
* /
2022-02-05 01:17:01 +00:00
exports . buildSearchData = ( data ) => {
const textData = [ ] ;
data . companies . forEach ( ( company , i ) => {
const thisTextData = {
id : String ( i ) ,
nameText : company . name ,
websiteText : company . websiteText ,
} ;
if ( company . shortRegion ) {
thisTextData . shortRegion = company . shortRegion ;
}
Object . keys ( exports . headingPropertyNames ) . forEach ( ( h ) => {
if ( company . profileContent [ h ] ) {
const text = cheerio
. load ( company . profileContent [ h ] )
. text ( )
// Replace warning emoji with a searchable token
. replace ( /\u26a0/ , "(_incomplete)" ) ;
thisTextData [ h ] = text ;
}
} ) ;
textData . push ( thisTextData ) ;
} ) ;
const index = lunr ( function ( ) {
this . field ( "nameText" ) ;
this . field ( "websiteText" ) ;
this . field ( "shortRegion" ) ;
Object . keys ( exports . headingPropertyNames ) . forEach ( ( h ) => {
this . field ( h ) ;
} ) ;
// https://github.com/olivernn/lunr.js/issues/25#issuecomment-623267494
this . metadataWhitelist = [ "position" ] ;
// https://github.com/olivernn/lunr.js/issues/192#issuecomment-172915226
// https://gist.github.com/olivernn/7cd496f8654a0246c53c
function contractionTrimmer ( token ) {
return token . update ( ( str ) => {
return str . replace ( /('m|'ve|n't|'d|'ll|'ve|'s|'re)$/ , "" ) ;
} ) ;
}
lunr . Pipeline . registerFunction ( contractionTrimmer , "contractionTrimmer" ) ;
this . pipeline . after ( lunr . trimmer , contractionTrimmer ) ;
Object . keys ( textData ) . forEach ( ( c ) => this . add ( textData [ c ] ) ) ;
} ) ;
const headings = getHeadingPropertyNames ( ) ;
headings . nameText = "Company name" ;
headings . websiteText = "Website" ;
headings . shortRegion = "Region" ;
return { index , textData , headings } ;
2020-05-06 06:42:21 +00:00
} ;