0
私は問題を解決するためのアイデアを考えるのに少し問題があります。 私はハッシュテーブルを使用して任意の数のファイルのすべての単語を と数え、すべてのファイルに含まれる単語 とその数だけを出力するプログラムを持っています。私はまた、すべての私の使用した ハッシュインデックスをリンクリストに格納します。HashTableトップ20カウントのアイデア
私自身の問題を解決したので、答えは簡単だろうと分かっていました。私はちょうど最も低い数のものを見つけたばかりで、もし私の新しい値がそれよりも大きければ、それは20ワードの構造体の配列中の最低のもののインデックスにあります。
ご協力いただきありがとうございます。
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <pthread.h>
#include <string.h>
/*Structures*///////////////////////////////////////////
//A struct to hold the words in the hash tables and their
//counts
struct counter{
int count;
int printed;
char word[51];
int allfiles[101];
struct counter * next;
};
//A struct to hold the hash indexes of the already visited
//index, for easy printing
struct index{
int used;
struct index * next;
};
//A simple struct to pass arguments to the work function for
//threading
struct arguments{
void * id;
int fileid;
};
////////////////////////////////////////////////////////
/*Functions*////////////////////////////////////////////
static int hash(char * word);
static void inHash(struct counter * newWord, int hash, int FILEID);
static void indexchain(int hash);
//static void hashprint(int NUMFILES);
static void * work(struct arguments *);
static void toptwenty(int NUMFILES);
static void print();
////////////////////////////////////////////////////////
/*Global Variables*/////////////////////////////////////
struct counter * top[20] = {0};
struct counter * hashtable[6222] = {0};
struct index * head;
////////////////////////////////////////////////////////
int main(int argc, char * argv[])
{
//check for valid number of arguments
if(argc < 2)
{
fprintf(stderr, "invalid number of arguments");
return 1;
}
//set up index chain starts with a null node
head = malloc(sizeof(struct index));
head->next = NULL;
head->used = -1;
//loop through any number of files
int arg;
for(arg = 1; arg < argc; arg++)
{
struct arguments * argum = malloc(sizeof(struct arguments));
argum->fileid = arg;
argum->id = ((void*)argv[arg]);
work(argum);
}
//hashprint(argc);
toptwenty(argc);
print();
return 0;
}
/*Function Definitions*/
//this function takes a file name and counts
//the words in the file
static void * work(struct arguments * argum)
{
int FILEID = argum->fileid;
void * in = argum->id;
int fd = open((char*)in, O_RDONLY);
if (fd == -1)
{
fprintf(stderr, "can't open %s for reading!\n", (char*)in);
exit(-1);
}
int BUFLEN = (int) lseek(fd, 0, SEEK_END);
lseek(fd, 0, 0);
//A few variable
char buf[BUFLEN + 1];
int lastRead;
lastRead = read(fd, buf, BUFLEN);
if (lastRead == -1)
{
fprintf(stderr, "error reading file %s!\n", (char*)in);
exit(-1);
}
//Parse the filebuffer for words.
char newword[51];
int c;
int curindex = 0;
buf[BUFLEN + 1] = ' ';
//not doing the last space because it is eof
for(c = 0; c < BUFLEN + 1; c++)
{
if((buf[c] >= 'A' && buf[c] <= 'Z'))
{
buf[c] += 32;
}
if(buf[c] >= 'a' && buf[c] <= 'z')
{
//add the next char to the string.
newword[curindex] = buf[c];
curindex++;
}
else
{
//make a new struct for the entry, and add it to the hashtable
//add its hash to the
if(strlen(newword) >= 6)
{
struct counter * temp = malloc(sizeof(struct counter));
strcpy(temp->word,newword);
int thishash = hash(temp->word);
//Only save hash indexes if they are in the first files
if(FILEID == 1)
{
indexchain(thishash);
}
inHash(temp, thishash, FILEID);
}
int wordlength = strlen(newword);
int i;
for(i = 0;i < wordlength; i++)
{
newword[i] = 0;
}
curindex = 0;
}
}
close(fd);
return in;
}
//Bad hash function by just adding ascii values of the
//characters
static int hash(char * word)
{
int loop = strlen(word);
int i;
int hashval = 0;
for(i = 0; i < loop; i++)
hashval += word[i];
return hashval;
}
//add a new word to the hash table
static void inHash(struct counter * newWord, int hash, int FILEID)
{
int eflag = 0;
if(hashtable[hash] == NULL)
{
//if the entry isnt in the table
if(FILEID == 1)
{
newWord->allfiles[FILEID] = 1; /*FILEID ARRAY TEST*/
newWord->count = 1;
newWord->next = NULL;
hashtable[hash] = newWord;
}
}
else
{
//if its not, but what if it is?
struct counter * cur = hashtable[hash];
if(strcmp(cur->word, newWord->word) == 0)
{
//is the word in the first slot?
cur->count += 1;
cur->allfiles[FILEID] = 1; /*FILEID ARRAY TEST*/
eflag = 1;
}
else
{
while(cur->next != NULL)
{
cur = cur->next;
if(strcmp(cur->word, newWord->word) == 0)
{
//if the word already exsists, update the count
cur->allfiles[FILEID] = 1; /*FILEID ARRAY TEST*/
cur->count += 1;
eflag = 1;
break;
}
}
}
//if its not in any bucket, make a new bucket
if(eflag == 0)
{
//Else add the new entry to the end of that list
if(FILEID == 1)
{
newWord->allfiles[FILEID] = 1; /*FILEID ARRAY TEST*/
newWord->count = 1;
newWord->next = NULL;
cur->next = newWord;
}
}
}
}
//adding a value to the linked list for printing
static void indexchain(int hash)
{
struct index * p = head;
int eflag = 0;
while(p->next != NULL)
{
if(p->used != hash)
p = p->next;
else
{
eflag = 1;
break;
}
}
if(eflag == 0)
{
struct index * newValue = malloc(sizeof(struct index));
newValue->used = hash;
newValue->next = NULL;
p->next = newValue;
}
}
/*
//This function will print the values in the hash tables and their counts
//Prints based on number of files to check if words are in all files
static void hashprint(int NUMFILES)
{
struct index * p;
p = head->next;
int hash;
int i;
int printbool = 1;
while(p != NULL)
{
hash = p->used;
struct counter * ptr = hashtable[hash];
while(ptr != NULL)
{
if(ptr->printed == 0)
{
for(i = 1; i < NUMFILES; i++)
{
if(ptr->allfiles[i] == 0)
{
printbool = 0;
break;
}
else
printbool = 1;
}
if(printbool == 1)
{
ptr->printed = 1;
printf("%s %d\n", ptr->word, ptr->count);
}
}
ptr = ptr->next;
}
p = p->next;
}
}
*/
//A function to see which numbers have the top twenty highest count
static void toptwenty(int NUMFILES)
{
struct index * p;
p = head->next;
int hash;
int i;
int printbool = 1;
while(p != NULL)
{
hash = p->used;
struct counter * ptr = hashtable[hash];
while(ptr != NULL)
{
if(ptr->printed == 0)
{
for(i = 1; i < NUMFILES; i++)
{
if(ptr->allfiles[i] == 0)
{
printbool = 0;
break;
}
else
printbool = 1;
}
if(printbool == 1)
{
for(i = 0; i < 20; i++)
{
if(top[i] == NULL)
{
top[i] = ptr;
break;
}
else if(ptr->count > top[i]->count)
{
top[i] = ptr;
break;
}
}
}
}
ptr = ptr->next;
}
p = p->next;
}
}
//print the top 20 count
static void print()
{
int i;
for(i = 0; i < 20; i++)
{
if(top[i] != NULL)
{
if(top[i]->printed == 0)
{
//printf("%s\n", top[i]->word);
printf("%s %d\n", top[i]->word, top[i]->count);
top[i]->printed = 1;
}
}
else
break;
}
}
解析する必要があるテキストの種類についてさらに詳しく説明できますか。またどれくらいそれがあるでしょうか。 – Azrael3000
これまで持っていたコードを投稿しますか?単語がどのファイルに含まれているかをどのように記録しますか? – gbulmer