Now that I finished my last post about counting keywords in source code, I will give away the little script I did to help me with the counting. It definitively requires some improvements to cover some extra cases I did not consider, but it work just fine if you manage to avoid them... or even better, implement them yourself :D
1. Multi-line comments are not supported. If there are reserved words within the comments, they will be counted as keywords. But worry not; single line comments work fine.
2. Text literals are not supported. If there are reserved words within the string, they will be counted as keywords.
I think regular expressions can handle those 2 cases. I'll add those two features if I need them in the future.
The programming language used was Python version 3.1.3
I formatted some parts of the code to correctly display it using the size limit of this blog's page. For instance, if the script doesn't compile; try using one line comma separated lists and if statements.
I think the code is simple and very easy to read... that's what all programmers say about their own code, right? Anyway, here below, I will begin by showing, function by function, the code of the script including lots of unnecessary comments and some screen shots that help explain what it does. Next, I will show a complete example of how to use it and the output.
The complete script without comments it's available at the bottom of this post.
Dissecting the Script
# import from sys to handle script's (command line) parameters
import sys
# get_line_comment_char function returns the Language's single
# line character(s)
# only one "single comments character" is taken in consideration
# even if the language supports multiple single line comments
# I grouped languages using the same comment character to reduce
# the number of ifs
def get_line_comment_char(language):
language = language.lower()
if language in ('c#', 'csharp', 'cpp', 'cppcli', 'c++', 'c++cli', 'f#',
'fsharp', 'boo', 'phalanger', 'php', 'delphiprism', 'delphi',
'nemerle', 'groovy', 'java', 'fantom', 'fan', 'jscript',
'jscriptnet', 'scala', 'javafx', 'javafxscript', 'gosu'):
return '//'
elif language in ('vb', 'vbnet', 'visualbasic'):
return '\''
elif language in ('jython', 'ironpython', 'python', 'jruby', 'ironruby',
'ruby', 'cobra'):
return '#'
elif language in ('zonnon'):
return '(*'
return None
Example:
# get_language_keywords function returns a list of all the
# keywords (reserved words) for the given language
# except for the primitive types since I didn't consider them
# as keywords for my comparison purposes
# if you need them, you can take the complete keywords lists
# this blog's Keywords page
# I removed the lists to save space because it makes the program
# too long and less readable, so, in the function below,
# for every ocurrence of keywords = '' you change '' with the list
# of keywords of that language.
def get_language_keywords(language):
language = language.lower()
if language == 'c#' or language == 'csharp':
keywords = ''
elif language == 'vb' or language == 'vbnet'
or language == 'visualbasic':
keywords = ''
elif language == 'cpp' or language == 'cppcli'
or language == 'c++' or language == 'c++cli':
keywords = ''
elif language == 'f#' or language == 'fsharp':
keywords = ''
elif language == "boo":
keywords = ''
elif language == 'phalanger' or language == 'php':
keywords = ''
elif language == 'jython' or language == 'ironpython'
or language == 'python':
keywords = ''
elif language == 'jruby' or language == 'ironruby'
or language == 'ruby':
keywords = ''
elif language == 'delphiprism' or language == 'delphi':
keywords = ''
elif language == 'zonnon':
keywords = ''
elif language == 'nemerle':
keywords = ''
elif language == 'groovy':
keywords = ''
elif language == 'java':
keywords = ''
elif language == 'cobra':
keywords = ''
elif language == 'fantom' or language == 'fan':
keywords = ''
elif language == 'jscript' or language == 'jscriptnet':
keywords = ''
elif language == 'scala':
keywords = ''
elif language == 'javafx' or language == 'javafxscript':
keywords = ''
elif language == 'gosu':
keywords = ''
else:
keywords = ''
return keywords.split()
Example:
# is_keyword function returns true if the given keyword is a
# valid keyword in the given language
def is_keyword(key, language_keys):
if key in language_keys:
return True
return False
Example:
# remove_special_characters function removes special characters
# from the given text and returns the resulting sequence
# special characters includes open and closing blocks,
# operators, and so on (see the list below).
def remove_special_characters(text):
special_characters = ('(','[','{','}',']',')','+','-','*','/','=','^','&',
'%','$','#','@','!','~','\'','\"', '?', '>', '<', ':',
';', ',', '.')
for character in special_characters:
text = text.replace(character, ' ')
return text
Example:
# remove_duplicates_in_list function removes any duplicate value
# found in the given sequence and returns the resulting sequence
def remove_duplicates_in_list(sequence):
# Thanks to Dave Kirby for this function taken from comments in
# http://www.peterbe.com/plog/uniqifiers-benchmark
seen = set()
seen_add = seen.add
return [x for x in sequence if x not in seen and not seen_add(x)]
Example:
# count_ocurrences_in_list function returns a new list of paired values
# that contain the keyword and how many times it appeared in the code
# using the standard count method of the sequences types
def count_ocurrences_in_list(sequence):
totals_by_word = []
no_duplicates = remove_duplicates_in_list(sequence)
for word in no_duplicates:
totals_by_word.append((word,sequence.count(word)))
return totals_by_word
Example:
# print_results function prints the given sequence to the console
# It displays the list of Keyword and Total Number of Keywords
def print_results(sequence):
print('', end='\n')
for item in sequence:
print('%s, %s' % (item[0], item[1]))
Example:
# print_to_file function creates an output file and prints the given
# sequence into it.
# It writes the list of Keyword and Total Number of Keywords
def print_to_file(sequence, output):
with open(output, mode='w', encoding='utf-8') as output_file:
output_file.write('\n')
for item in sequence:
output_file.write('%s, %s\n' % (item[0], item[1]))
Example:
Output:
Opening CSV:
# Main function. The program's entry point.
def main(argv=None):
# if command line arguments are not empty
if argv is None:
# assign them to the local sequence variable argv
argv = sys.argv
# validate the script was used correctly.
if len(argv) != 4:
# Otherwise, print the usage message to the console
print("usage: program language sourcefile outputfile")
# and exit the script
sys.exit()
Example:
# get command line arguments
# parameter 2 should be the language of the source code you want to count
programming_language = sys.argv[1]
# parameter 3 should be the file that contains the source code you want to count
source_file = sys.argv[2]
# parameter 4 should be the file that will contain the counting results
output_file = sys.argv[3]
# get the list of reserved words for the given programming language
language_keywords = get_language_keywords(programming_language)
# get the single line comment character for the given programming language
comment_character = get_line_comment_char(programming_language)
Example:
Here I added some print() and exit() to stop execution of the script
# define a variable that will hold the total number of keywords found in the file
total_keywords_found = 0
# define a list that will store all valid language keywords found in the file
list_of_keywords_found = []
line_number = 0
# open a file as read only (default mode)
with open(source_file, encoding='utf-8') as source_code:
# navigate through the lines of the file
for line in source_code:
# get the line number. Used only for console display purposes
line_number += 1
Example:
The following line adds the line number to the beginning of the line being read
print('{:>4} {}'.format(line_number, line.rstrip()), end='\n')
# remove line comment if there is one based on the
# language's single line comment character
comment_idx = line.find(comment_character)
if comment_idx >= 0:
line = line[0:comment_idx]
Example:
# remove special characters from remaining text in line
# (operators and open-close characters)
line = remove_special_characters(line)
# create a list with all remaining words in the line
words = line.split()
Example:
# navigate through the words of the sequence
for word in words:
# if the current word is a valid keyword for the
# given language
if is_keyword(word, language_keywords):
# add to the list of
# "total keywords found in the source code" the word
list_of_keywords_found.append(word)
# increment the number of total words found
# in the source code
total_keywords_found = total_keywords_found + 1
Example:
# define a list that will store all paired
# values (keyword and total ocurrences of that keyword)
totals_to_output = []
# insert first pair of values as the first element of the list,
# that is the Programming Language and the Total Keywords found
totals_to_output.append((programming_language.capitalize(),str(total_keywords_found)))
# insert all other pairs of values (keyword and total ocurrences
# of that keyword) found on the source code
totals_to_output.extend(count_ocurrences_in_list(list_of_keywords_found))
Example:
# print the results to the console
print_results(totals_to_output)
# print the results to the output file
print_to_file(totals_to_output, output_file)
# run the program
if __name__ == "__main__":
main()
Example:
Using the Script
How to use is was already shown in the step by step example above, but here below I just show you how I did the counting for my previous post "Keywords in source code Round 2"
First I created 2 folders:
1. SrcCode
2. OutCode
Then, I added all the source code files I wanted to count from into the SrcCode folder.
Next step was to create a batch file that runs the python script for all the files in the SrcCode folder.
And ran it
Console Output
Files Output in folder OutCode
Et VoilĂ ! I then started merging together the results and did the graphs.
Hope this script works for you or at least shows you some cool python features :)
Final Script
Just don't forget to add the language keywords! take them from the Keywords Page.
Example:
keywords = 'public private static internal for if else while switch'
import sys
def get_line_comment_char(language):
language = language.lower()
if language in ('c#', 'csharp', 'cpp', 'cppcli', 'c++', 'c++cli', 'f#',
'fsharp', 'boo', 'phalanger', 'php', 'delphiprism', 'delphi',
'nemerle', 'groovy', 'java', 'fantom', 'fan', 'jscript',
'jscriptnet', 'scala', 'javafx', 'javafxscript', 'gosu'):
return '//'
elif language in ('vb', 'vbnet', 'visualbasic'):
return '\''
elif language in ('jython', 'ironpython', 'python', 'jruby', 'ironruby',
'ruby', 'cobra'):
return '#'
elif language in ('zonnon'):
return '(*'
return None
def get_language_keywords(language):
language = language.lower()
if language == 'c#' or language == 'csharp':
keywords = ''
elif language == 'vb' or language == 'vbnet'
or language == 'visualbasic':
keywords = ''
elif language == 'cpp' or language == 'cppcli'
or language == 'c++' or language == 'c++cli':
keywords = ''
elif language == "f#" or language == "fsharp":
keywords = ''
elif language == "boo":
keywords = ''
elif language == 'phalanger' or language == 'php':
keywords = ''
elif language == 'jython' or language == 'ironpython'
or language == 'python':
keywords = ''
elif language == 'jruby' or language == 'ironruby'
or language == 'ruby':
keywords = ''
elif language == 'delphiprism' or language == 'delphi':
keywords = ''
elif language == 'zonnon':
keywords = ''
elif language == 'nemerle':
keywords = ''
elif language == 'groovy':
keywords = ''
elif language == 'java':
keywords = ''
elif language == 'cobra':
keywords = ''
elif language == 'fantom' or language == 'fan':
keywords = ''
elif language == 'jscript' or language == 'jscriptnet':
keywords = ''
elif language == 'scala':
keywords = ''
elif language == 'javafx' or language == 'javafxscript':
keywords = ''
elif language == 'gosu':
keywords = ''
else:
keywords = ''
return keywords.split()
def is_keyword(key, language_keys):
if key in language_keys:
return True
return False
def remove_special_characters(text):
special_characters = ('(','[','{','}',']',')','+','-','*','/','=','^','&',
'%','$','#','@','!','~','\'','\"', '?', '>', '<', ':',
';', ',', '.')
for character in special_characters:
text = text.replace(character, ' ')
return text
def remove_duplicates_in_list(sequence):
seen = set()
seen_add = seen.add
return [x for x in sequence if x not in seen and not seen_add(x)]
def count_ocurrences_in_list(sequence):
totals_by_word = []
no_duplicates = remove_duplicates_in_list(sequence)
for word in no_duplicates:
totals_by_word.append((word,sequence.count(word)))
return totals_by_word
def print_results(sequence):
print('', end='\n')
for item in sequence:
print('%s, %s' % (item[0], item[1]))
def print_to_file(sequence, output):
with open(output, mode='w', encoding='utf-8') as output_file:
output_file.write('\n')
for item in sequence:
output_file.write('%s, %s\n' % (item[0], item[1]))
def main(argv=None):
if argv is None:
argv = sys.argv
if len(argv) != 4:
print("usage: program language sourcefile outputfile")
sys.exit()
programming_language = sys.argv[1]
source_file = sys.argv[2]
output_file = sys.argv[3]
language_keywords = get_language_keywords(programming_language)
comment_character = get_line_comment_char(programming_language)
total_keywords_found = 0
list_of_keywords_found = []
line_number = 0
with open(source_file, encoding='utf-8') as source_code:
for line in source_code:
line_number += 1
comment_idx = line.find(comment_character)
if comment_idx >= 0:
line = line[0:comment_idx]
line = remove_special_characters(line)
words = line.split()
for word in words:
if is_keyword(word, language_keywords):
list_of_keywords_found.append(word)
total_keywords_found = total_keywords_found + 1
totals_to_output = []
totals_to_output.append((programming_language.capitalize(),str(total_keywords_found)))
totals_to_output.extend(count_ocurrences_in_list(list_of_keywords_found))
print_results(totals_to_output)
print_to_file(totals_to_output, output_file)
if __name__ == "__main__":
main()
Thaank you for writing this
ReplyDelete